diff --git a/sign/internal/dilithium/arm64.go b/sign/internal/dilithium/arm64.go
index dedcb596..83f1e867 100644
--- a/sign/internal/dilithium/arm64.go
+++ b/sign/internal/dilithium/arm64.go
@@ -81,7 +81,7 @@ func (p *Poly) Exceeds(bound uint32) bool {
 //
 // So it requires the coefficients of p to be less than 2³²⁻ᴰ.
 func (p *Poly) MulBy2toD(q *Poly) {
-	p.mulBy2toDGeneric(q)
+	polyMulBy2toDARM64(p, q) // use the assembly routine from arm64.s in place of the generic Go version
 }
 
 // Splits p into p1 and p0 such that [i]p1 * 2ᴰ + [i]p0 = [i]p
@@ -98,3 +98,6 @@ func polyAddARM64(p, a, b *Poly)
 
 //go:noescape
 func polyPackLe16ARM64(p *Poly, buf *byte)
+
+//go:noescape
+func polyMulBy2toDARM64(p, q *Poly) // body is in arm64.s; writes each 32-bit lane of q shifted left by const_D into p
diff --git a/sign/internal/dilithium/arm64.s b/sign/internal/dilithium/arm64.s
index f609e9a0..47eb7958 100644
--- a/sign/internal/dilithium/arm64.s
+++ b/sign/internal/dilithium/arm64.s
@@ -69,3 +69,25 @@ loop:
     BGT loop
 
     RET
+
+// func polyMulBy2toDARM64(p, q *Poly)
+TEXT ·polyMulBy2toDARM64(SB), NOSPLIT|NOFRAME, $0-16
+    MOVD p+0(FP), R0 // R0 = p, the pointer that results are stored through
+    MOVD q+8(FP), R1 // R1 = q, the pointer that inputs are loaded from
+
+    MOVW $(const_N / 16), R2 // R2 = iteration count; each pass handles 16 32-bit lanes (4 registers x 4 lanes)
+
+loop:
+    VLD1.P (64)(R1), [V0.S4, V1.S4, V2.S4, V3.S4] // load 64 bytes from q into V0-V3, then advance R1 by 64
+
+    VSHL $(const_D), V0.S4, V0.S4 // shift every 32-bit lane left by const_D bits; no reduction is applied
+    VSHL $(const_D), V1.S4, V1.S4 // same shift for lanes 4-7 of this group
+    VSHL $(const_D), V2.S4, V2.S4 // same shift for lanes 8-11 of this group
+    VSHL $(const_D), V3.S4, V3.S4 // same shift for lanes 12-15 of this group
+
+    VST1.P [V0.S4, V1.S4, V2.S4, V3.S4], (64)(R0) // store the 64 shifted bytes to p, then advance R0 by 64
+
+    SUBS $1, R2, R2 // decrement the remaining-iterations counter and set the condition flags
+    BGT loop // repeat while the counter is still greater than zero
+
+    RET