diff --git a/gfp12.go b/gfp12.go
index 250e65e..c2332cb 100644
--- a/gfp12.go
+++ b/gfp12.go
@@ -158,6 +158,52 @@ func (c *gfP12) Exp(a *gfP12, power *big.Int) *gfP12 {
 	return c
 }
 
+// powToVCyclo6 sets e = a^v for v = 1868033 and returns e. a must lie in the cyclotomic
+// subgroup: the chain uses SquareCyclo6 and treats Conjugate as inversion (a ^ -8 below).
+// "New software speed records for cryptographic pairings", Section 3.3, Algorithm 2:
+// https://cryptojedi.org/papers/dclxvi-20100714.pdf
+func (e *gfP12) powToVCyclo6(a *gfP12) *gfP12 {
+	// Fixed addition chain for v: 21 special squarings and 4 multiplications.
+	t0, t1, t2 := &gfP12{}, &gfP12{}, &gfP12{}
+	
+	t0.SquareCyclo6(a)
+	t0.SquareCyclo6(t0)
+	t0.SquareCyclo6(t0) // t0 = a ^ 8
+	t1.SquareCyclo6(t0)
+	t1.SquareCyclo6(t1)
+	t1.SquareCyclo6(t1) // t1 = a ^ 64
+	t2.Conjugate(t0)    // t2 = a ^ -8
+	t2.Mul(t2, a)       // t2 = a ^ -7
+	t2.Mul(t2, t1)      // t2 = a ^ 57
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2) // t2 = a ^ (2^7 * 57) = a ^ 7296
+	t2.Mul(t2, a)       // t2 = a ^ 7297
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2)
+	t2.SquareCyclo6(t2) // t2 = a ^ (7297 * 256) = a ^ 1868032
+	e.Mul(t2, a)        // e = a ^ 1868033 = a ^ v
+	return e
+}
+
+// PowToUCyclo6 sets e = a^u and returns e. It is meant for the final exponentiation, after the
+// easy part a^((p^6-1)(p^2+1)). Because u = v^3, a^u is three chained calls to [powToVCyclo6].
+func (e *gfP12) PowToUCyclo6(a *gfP12) *gfP12 {
+	e.powToVCyclo6(a) // e = a^v
+	e.powToVCyclo6(e) // e = a^(v^2)
+	e.powToVCyclo6(e) // e = a^(v^3) = a^u
+	return e
+}
+
 func (e *gfP12) Square(a *gfP12) *gfP12 {
 	// Complex squaring algorithm
 	v0 := (&gfP6{}).Mul(&a.x, &a.y)
@@ -174,6 +220,105 @@ func (e *gfP12) Square(a *gfP12) *gfP12 {
 	return e
 }
 
+// SquareCyclo6 sets e = a² and returns e, using the Granger/Scott special squaring.
+// It is only valid when a lies in the cyclotomic subgroup, which holds for the
+// result of the easy part of the final exponentiation, a^((p^6-1)(p^2+1)).
+// "New software speed records for cryptographic pairings"
+// Section 3.3, Final exponentiation
+// https://cryptojedi.org/papers/dclxvi-20100714.pdf
+// The formula reference:
+// Granger/Scott (PKC2010).
+// Section 3.2
+// https://eprint.iacr.org/2009/565.pdf
+func (e *gfP12) SquareCyclo6(a *gfP12) *gfP12 {
+	// f = xω + y = (h2τ² + h1τ + h0)ω + (g2τ² + g1τ + g0) = h2ω^5 + g2ω^4 + h1ω^3 + g1ω² + h0ω + g0
+	// We can also represent f as a cubic over a quadratic extension:
+	// Fp4[s]=Fp2[s]/(s^2-ξ), Fp12[t]=Fp4[t]/(t^3-s), s^2=ξ, t^3=s then
+	// f = ct² + bt + a = (c0 + c1s)t² + (b0 + b1s)t + (a0 + a1s) = c1t^5 + b1t^4 + a1t^3 + c0t^2 + b0t + a0
+	// Both extensions are built on Fp2, so t^6 = ω^6 = ξ, and
+	// a0 = g0, a1 = h1, b0 = h0, b1 = g2, c0 = g1, c1 = h2
+	// g0 = a.y.z, h1 = a.x.y, h0 = a.x.z, g2 = a.y.x, g1 = a.y.y, h2 = a.x.x
+	tmp := &gfP12{} // result is built here and copied into e at the end, so a is never overwritten mid-computation
+	// f0i point into tmp.y (the g coefficients) and f1i into tmp.x (the h coefficients).
+	f02 := &tmp.y.x
+	f01 := &tmp.y.y
+	f00 := &tmp.y.z
+	f12 := &tmp.x.x
+	f11 := &tmp.x.y
+	f10 := &tmp.x.z
+
+	t00, t01, t02, t10, t11, t12 := &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{}, &gfP2{}
+	// gfP4Square(retX, retY, x, y) squares xs + y and yields retX·s + retY.
+	gfP4Square(t11, t00, &a.x.y, &a.y.z) // (t00 + t11s) = (a0 + a1s)^2 = a²
+	gfP4Square(t12, t01, &a.y.x, &a.x.z) // (t01 + t12s) = (b0 + b1s)^2 = b²
+	gfP4Square(t02, t10, &a.x.x, &a.y.y) // (t10 + t02s) = (c0 + c1s)^2 = c²
+
+	// (t10 + t02s) <- (t10 + t02s)·s = ξ·t02 + t10·s, i.e. c²s; f00 is borrowed as scratch.
+	f00.MulXi(t02)
+	t02.Set(t10)
+	t10.Set(f00)
+
+	// Triple every coefficient (x + x + x), again with f00 as scratch:
+	// (t00 + t11s) = 3a²
+	// (t01 + t12s) = 3b²
+	// (t10 + t02s) = 3c²s
+	f00.Add(t00, t00)
+	t00.Add(f00, t00)
+	f00.Add(t01, t01)
+	t01.Add(f00, t01)
+	f00.Add(t02, t02)
+	t02.Add(f00, t02)
+	f00.Add(t10, t10)
+	t10.Add(f00, t10)
+	f00.Add(t11, t11)
+	t11.Add(f00, t11)
+	f00.Add(t12, t12)
+	t12.Add(f00, t12)
+
+	// (f00 + f11s) = -2Conjugate(a0 + a1s) = -2Conjugate(a)
+	// (f01 + f12s) = -2Conjugate(c0 + c1s) = -2Conjugate(c)
+	// (f10 + f02s) = 2Conjugate(b0 + b1s) = 2Conjugate(b)
+	f00.Add(&a.y.z, &a.y.z)
+	f00.Neg(f00)
+	f01.Add(&a.y.y, &a.y.y)
+	f01.Neg(f01)
+	f02.Add(&a.y.x, &a.y.x)
+	f02.Neg(f02)
+	f10.Add(&a.x.z, &a.x.z)
+	f11.Add(&a.x.y, &a.x.y)
+	f12.Add(&a.x.x, &a.x.x)
+
+	// A = (f00 + f11s) = 3a² - 2Conjugate(a)
+	// C = (f01 + f12s) = 3b² - 2Conjugate(c)
+	// B = (f10 + f02s) = 3c²s + 2Conjugate(b)
+	f00.Add(f00, t00)
+	f01.Add(f01, t01)
+	f02.Add(f02, t02)
+	f10.Add(f10, t10)
+	f11.Add(f11, t11)
+	f12.Add(f12, t12)
+	// tmp now holds Ct² + Bt + A; copy it into the receiver.
+	return e.Set(tmp)
+}
+
+// gfP4Square squares the gfP4 element xu+y (u²=ξ), given as two gfP2 values x and y, and
+// stores the square as retX·u + retY. It is the implicit gfP4 squaring used by the
+// Granger/Scott special squaring in the final exponentiation.
+func gfP4Square(retX, retY, x, y *gfP2) {
+	t1, t2 := &gfP2{}, &gfP2{}
+
+	t1.Square(x) // t1 = x²
+	t2.Square(y) // t2 = y²
+
+	retX.Add(x, y)
+	retX.Square(retX)  // (x + y)²
+	retX.Sub(retX, t1) // (x + y)² - x²
+	retX.Sub(retX, t2) // retX = 2xy
+
+	retY.MulXi(t1)
+	retY.Add(retY, t2) // retY = x^2*xi + y^2
+}
+
 func (e *gfP12) Invert(a *gfP12) *gfP12 {
 	// See "Implementing cryptographic pairings", M. Scott, section 3.2.
 	// ftp://136.206.11.249/pub/crypto/pairings.pdf
diff --git a/gfp12_test.go b/gfp12_test.go
new file mode 100644
index 0000000..58670c4
--- /dev/null
+++ b/gfp12_test.go
@@ -0,0 +1,87 @@
+package bn256
+
+import (
+	"math/big"
+	"testing"
+)
+
+func TestGfP12SquareCyclo6(t *testing.T) {
+	// in MUST be an element of the 6-th cyclotomic group (gfP12Gen is presumed to be one; it is defined elsewhere).
+	in := gfP12Gen
+
+	got := &gfP12{}
+	expected := &gfP12{}
+	// The special squaring must produce exactly what the generic Square produces.
+	got.SquareCyclo6(in)
+	expected.Square(in)
+
+	if *got != *expected {
+		t.Errorf("not same got=%v, expected=%v", got, expected)
+	}
+}
+
+func TestGfp12PowToVCyclo6(t *testing.T) {
+	// in MUST be an element of the 6-th cyclotomic group (gfP12Gen is presumed to be one; it is defined elsewhere).
+	in := gfP12Gen
+
+	got := &gfP12{}
+	expected := &gfP12{}
+	// The fixed addition chain must agree with the generic Exp for the exponent v = 1868033.
+	got.powToVCyclo6(in)
+	expected.Exp(in, big.NewInt(1868033))
+
+	if *got != *expected {
+		t.Errorf("not same got=%v, expected=%v", got, expected)
+	}
+}
+
+func TestGfp12PowToUCyclo6(t *testing.T) {
+	// in MUST be an element of the 6-th cyclotomic group (gfP12Gen is presumed to be one; it is defined elsewhere).
+	in := gfP12Gen
+
+	got := &gfP12{}
+	expected := &gfP12{}
+	// Three chained powToVCyclo6 calls must agree with the generic Exp for the package-level exponent u.
+	got.PowToUCyclo6(in)
+	expected.Exp(in, u)
+
+	if *got != *expected {
+		t.Errorf("not same got=%v, expected=%v", got, expected)
+	}
+}
+
+func BenchmarkGfp12Square(b *testing.B) {
+	got := &gfP12{}
+	b.ResetTimer()
+	// Baseline: generic gfP12 squaring, for comparison with BenchmarkGfp12SquareCyclo6.
+	for i := 0; i < b.N; i++ {
+		got.Square(gfP12Gen)
+	}
+}
+
+func BenchmarkGfp12SquareCyclo6(b *testing.B) {
+	got := &gfP12{}
+	b.ResetTimer()
+	// Special squaring on the same input as BenchmarkGfp12Square.
+	for i := 0; i < b.N; i++ {
+		got.SquareCyclo6(gfP12Gen)
+	}
+}
+
+func BenchmarkGfp12ExpU(b *testing.B) {
+	got := &gfP12{}
+	b.ResetTimer()
+	// Baseline: generic Exp with the exponent u, for comparison with BenchmarkGfp12PowToUCyclo6.
+	for i := 0; i < b.N; i++ {
+		got.Exp(gfP12Gen, u)
+	}
+}
+
+func BenchmarkGfp12PowToUCyclo6(b *testing.B) {
+	got := &gfP12{}
+	b.ResetTimer()
+	// Addition-chain exponentiation by u on the same input as BenchmarkGfp12ExpU.
+	for i := 0; i < b.N; i++ {
+		got.PowToUCyclo6(gfP12Gen)
+	}
+}
diff --git a/optate.go b/optate.go
index 126c64c..362e1f0 100644
--- a/optate.go
+++ b/optate.go
@@ -196,9 +196,8 @@ func miller(q *twistPoint, p *curvePoint) *gfP12 {
 	r = newR
 
 	r2.Square(&minusQ2.y)
-	a, b, c, newR = lineFunctionAdd(r, minusQ2, bAffine, r2)
+	a, b, c, _ = lineFunctionAdd(r, minusQ2, bAffine, r2)
 	mulLine(ret, a, b, c)
-	r = newR
 
 	return ret
 }
@@ -218,15 +217,15 @@ func finalExponentiation(in *gfP12) *gfP12 {
 	t1.Mul(t1, inv)
 
 	t2 := (&gfP12{}).FrobeniusP2(t1)
-	t1.Mul(t1, t2)
+	t1.Mul(t1, t2) 	// t1 = in^(p^6-1)(p^2+1), where t1 becomes an element of the 6-th cyclotomic group.
 
 	fp := (&gfP12{}).Frobenius(t1)
 	fp2 := (&gfP12{}).FrobeniusP2(t1)
 	fp3 := (&gfP12{}).Frobenius(fp2)
 
-	fu := (&gfP12{}).Exp(t1, u)
-	fu2 := (&gfP12{}).Exp(fu, u)
-	fu3 := (&gfP12{}).Exp(fu2, u)
+	fu := (&gfP12{}).PowToUCyclo6(t1)
+	fu2 := (&gfP12{}).PowToUCyclo6(fu)
+	fu3 := (&gfP12{}).PowToUCyclo6(fu2)
 
 	y3 := (&gfP12{}).Frobenius(fu)
 	fu2p := (&gfP12{}).Frobenius(fu2)
@@ -245,14 +244,14 @@ func finalExponentiation(in *gfP12) *gfP12 {
 	y6 := (&gfP12{}).Mul(fu3, fu3p)
 	y6.Conjugate(y6)
 
-	t0 := (&gfP12{}).Square(y6)
+	t0 := (&gfP12{}).SquareCyclo6(y6)
 	t0.Mul(t0, y4).Mul(t0, y5)
 	t1.Mul(y3, y5).Mul(t1, t0)
 	t0.Mul(t0, y2)
-	t1.Square(t1).Mul(t1, t0).Square(t1)
+	t1.SquareCyclo6(t1).Mul(t1, t0).SquareCyclo6(t1)
 	t0.Mul(t1, y1)
 	t1.Mul(t1, y0)
-	t0.Square(t0).Mul(t0, t1)
+	t0.SquareCyclo6(t0).Mul(t0, t1)
 
 	return t0
 }