diff --git a/internal/poly1305/_asm/go.mod b/internal/poly1305/_asm/go.mod new file mode 100644 index 0000000000..47f2b758ef --- /dev/null +++ b/internal/poly1305/_asm/go.mod @@ -0,0 +1,15 @@ +module internal/poly1305/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/internal/poly1305/_asm/go.sum b/internal/poly1305/_asm/go.sum new file mode 100644 index 0000000000..62ea9dfb70 --- /dev/null +++ b/internal/poly1305/_asm/go.sum @@ -0,0 +1,12 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/internal/poly1305/_asm/sum_amd64_asm.go b/internal/poly1305/_asm/sum_amd64_asm.go new file mode 100644 index 0000000000..a445c68f01 --- /dev/null +++ b/internal/poly1305/_asm/sum_amd64_asm.go @@ -0,0 +1,126 @@ +// Copyright 2024 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	. "github.com/mmcloughlin/avo/build"
+	. "github.com/mmcloughlin/avo/operand"
+	. "github.com/mmcloughlin/avo/reg"
+	// NOTE(review): nothing in this file uses sha3; the blank import presumably
+	// exists only to keep golang.org/x/crypto in go.mod's require list so the
+	// Package(...) call below can resolve the poly1305 package — confirm.
+	_ "golang.org/x/crypto/sha3"
+)
+
+//go:generate go run . -out ../sum_amd64.s -pkg poly1305
+
+// main drives the generator: it names the package the assembly belongs to,
+// attaches the "gc,!purego" build constraint, emits the update routine and
+// then hands over to Generate to produce the output.
+func main() {
+	Package("golang.org/x/crypto/internal/poly1305")
+	ConstraintExpr("gc,!purego")
+	update()
+	Generate()
+}
+
+// update emits the body of the assembly routine opened by Implement("update").
+//
+// Register assignment used throughout the emitted code:
+//
+//	DI        pointer loaded from the "state" parameter
+//	SI        read cursor into msg (loaded from msg_base)
+//	R15       number of message bytes still to process (loaded from msg_len)
+//	R8-R10    accumulator limbs h0, h1, h2 (state offsets 0, 8, 16)
+//	R11, R12  multiplier limbs r0, r1 (state offsets 24, 32)
+//	BX, CX, R13, R14  scratch; AX and DX are clobbered by POLY1305_MUL
+//
+// Control flow of the emitted code:
+//
+//   - While at least 16 bytes remain, "loop" adds one 16-byte block to h and
+//     "multiply" multiplies h by r, then 16 is subtracted from R15.
+//   - With 1..15 bytes left, "flush_buffer" assembles them into CX:BX with a
+//     single 1 bit placed just above the last message byte, adds that to h and
+//     jumps back to "multiply" with R15 forced to 16, so that the SUBQ after
+//     the multiply leaves R15 at zero and the routine falls through to "done".
+//   - "done" stores h0, h1, h2 back to state offsets 0, 8, 16. r0 and r1 are
+//     only read, never written back.
+func update() {
+	Implement("update")
+
+	Load(Param("state"), RDI)
+	MOVQ(NewParamAddr("msg_base", 8), RSI)
+	MOVQ(NewParamAddr("msg_len", 16), R15)
+
+	MOVQ(Mem{Base: DI}.Offset(0), R8)   // h0
+	MOVQ(Mem{Base: DI}.Offset(8), R9)   // h1
+	MOVQ(Mem{Base: DI}.Offset(16), R10) // h2
+	MOVQ(Mem{Base: DI}.Offset(24), R11) // r0
+	MOVQ(Mem{Base: DI}.Offset(32), R12) // r1
+
+	// Fewer than 16 bytes in total: skip the full-block loop entirely.
+	CMPQ(R15, Imm(16))
+	JB(LabelRef("bytes_between_0_and_15"))
+
+	Label("loop")
+	POLY1305_ADD(RSI, R8, R9, R10)
+
+	// "multiply" is a separate label because the partial-block path below
+	// jumps here after doing its own addition.
+	Label("multiply")
+	POLY1305_MUL(R8, R9, R10, R11, R12, RBX, RCX, R13, R14)
+	SUBQ(Imm(16), R15)
+	CMPQ(R15, Imm(16))
+	JAE(LabelRef("loop"))
+
+	Label("bytes_between_0_and_15")
+	TESTQ(R15, R15)
+	JZ(LabelRef("done"))
+	// Start the partial block as CX:BX = 1; every byte shifted in below moves
+	// this 1 one byte higher, so it ends up directly above the message bytes.
+	MOVQ(U32(1), RBX)
+	XORQ(RCX, RCX)
+	XORQ(R13, R13)
+	// Point SI one past the last remaining byte; bytes are consumed backwards.
+	ADDQ(R15, RSI)
+
+	Label("flush_buffer")
+	// Shift the 128-bit pair CX:BX left by one byte (CX takes the bits that
+	// leave the top of BX), then place the next byte in the freed low byte.
+	SHLQ(Imm(8), RBX, RCX)
+	SHLQ(Imm(8), RBX)
+	MOVB(Mem{Base: SI}.Offset(-1), R13B)
+	XORQ(R13, RBX)
+	DECQ(RSI)
+	DECQ(R15)
+	JNZ(LabelRef("flush_buffer"))
+
+	// Add the assembled partial block to h. Only the carry goes into h2 here:
+	// the extra 1 bit is already part of CX:BX, unlike the full-block path
+	// where POLY1305_ADD adds it to h2.
+	ADDQ(RBX, R8)
+	ADCQ(RCX, R9)
+	ADCQ(Imm(0), R10)
+	// Pretend exactly one block is left so the SUBQ after "multiply" ends at 0.
+	MOVQ(U32(16), R15)
+	JMP(LabelRef("multiply"))
+
+	Label("done")
+	MOVQ(R8, Mem{Base: DI}.Offset(0))
+	MOVQ(R9, Mem{Base: DI}.Offset(8))
+	MOVQ(R10, Mem{Base: DI}.Offset(16))
+	RET()
+}
+
+// POLY1305_ADD emits the addition of one full 16-byte block to the accumulator.
+//
+// The two 64-bit words at msg+0 and msg+8 are added to h0 and h1 with carry
+// propagation, and 1 plus the carry is added to h2 (with h0, h1, h2 read as
+// 64-bit limbs of one number, that 1 is the bit at position 128). msg is then
+// advanced by 16 bytes. The name mirrors the assembler macro this function
+// replaces.
+func POLY1305_ADD(msg, h0, h1, h2 GPPhysical) {
+	ADDQ(Mem{Base: msg}.Offset(0), h0)
+	ADCQ(Mem{Base: msg}.Offset(8), h1)
+	ADCQ(Imm(1), h2)
+	LEAQ(Mem{Base: msg}.Offset(16), msg)
+}
+
+// POLY1305_MUL emits the multiplication of the accumulator (h0, h1, h2) by
+// the two-limb multiplier (r0, r1), followed by a partial reduction.
+//
+// h0, h1, h2 are updated in place; r0 and r1 are only read; t0..t3 are scratch
+// registers that receive the four limbs of the unreduced product (t0 lowest).
+// RAX and RDX are used implicitly by MULQ and are clobbered, so they must not
+// be passed as any of the arguments.
+//
+// Reduction: the low 130 bits of the product are kept (t0, t1 and the low two
+// bits of t2). Everything above bit 130, call it c, is added back twice: once
+// as t3:(t2 with its low two bits cleared), which equals 4*c, and once as
+// t3:t2 shifted right by two, which equals c. The total of 5*c corresponds to
+// 2^130 = 5 (mod 2^130 - 5). Carries from these additions land in h2, so the
+// result is only partially reduced.
+//
+// The name mirrors the assembler macro this function replaces.
+func POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3 GPPhysical) {
+	// t1:t0 = r0 * h0
+	MOVQ(r0, RAX)
+	MULQ(h0)
+	MOVQ(RAX, t0)
+	MOVQ(RDX, t1)
+	// t1 += low(r0 * h1); the high half (plus carry) stays in RDX
+	MOVQ(r0, RAX)
+	MULQ(h1)
+	ADDQ(RAX, t1)
+	ADCQ(Imm(0), RDX)
+	// t2 = r0 * h2 + high(r0 * h1). IMULQ keeps only the low 64 bits.
+	// NOTE(review): presumably safe because h2 holds only a few bits and r is
+	// clamped by the caller — neither is visible in this file; confirm.
+	MOVQ(r0, t2)
+	IMULQ(h2, t2)
+	ADDQ(RDX, t2)
+
+	// t1 += low(r1 * h0)
+	MOVQ(r1, RAX)
+	MULQ(h0)
+	ADDQ(RAX, t1)
+	ADCQ(Imm(0), RDX)
+	// h0 has been read for the last time; reuse it to park high(r1 * h0)
+	// while RDX is overwritten by the next MULQ.
+	MOVQ(RDX, h0)
+	// t3 = r1 * h2 (low 64 bits only, see the note on IMULQ above)
+	MOVQ(r1, t3)
+	IMULQ(h2, t3)
+	// t3:t2 += r1 * h1, then add the parked high(r1 * h0)
+	MOVQ(r1, RAX)
+	MULQ(h1)
+	ADDQ(RAX, t2)
+	ADCQ(RDX, t3)
+	ADDQ(h0, t2)
+	ADCQ(Imm(0), t3)
+
+	// Keep the low 130 bits of the product as the new h.
+	MOVQ(t0, h0)
+	MOVQ(t1, h1)
+	MOVQ(t2, h2)
+	ANDQ(Imm(3), h2)
+	// First fold: add t3:(t2 &^ 3), i.e. four times the bits above bit 130.
+	// -4 sign-extends to a mask that clears only the low two bits.
+	MOVQ(t2, t0)
+	ANDQ(I32(-4), t0)
+	ADDQ(t0, h0)
+	ADCQ(t3, h1)
+	ADCQ(Imm(0), h2)
+	// Second fold: shift t3:t2 right by two (t2 is refilled from t3) and add
+	// it once more, giving five times the overflow in total.
+	SHRQ(Imm(2), t3, t2)
+	SHRQ(Imm(2), t3)
+	ADDQ(t2, h0)
+	ADCQ(t3, h1)
+	ADCQ(Imm(0), h2)
+}
diff --git a/internal/poly1305/sum_amd64.s b/internal/poly1305/sum_amd64.s
index e0d3c64756..133757384b 100644
--- a/internal/poly1305/sum_amd64.s
+++ b/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
//go:build gc && !purego -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) +// func update(state *macState, msg []byte) TEXT ·update(SB), $0-32 MOVQ state+0(FP), DI MOVQ msg_base+8(FP), SI MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 + MOVQ (DI), R8 + MOVQ 8(DI), R9 + MOVQ 16(DI), R10 + MOVQ 24(DI), R11 + MOVQ 32(DI), R12 + CMPQ R15, $0x10 JB bytes_between_0_and_15 loop: - POLY1305_ADD(SI, R8, R9, R10) + ADDQ (SI), R8 + ADCQ 8(SI), R9 + ADCQ $0x01, R10 + LEAQ 16(SI), SI multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop + MOVQ R11, AX + MULQ R8 + MOVQ AX, BX + MOVQ DX, CX + MOVQ R11, AX + MULQ R9 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ R11, R13 + IMULQ R10, R13 + ADDQ DX, R13 + MOVQ R12, AX + MULQ R8 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + MOVQ R12, R14 + IMULQ R10, R14 + MOVQ R12, AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R14 + ADDQ R8, R13 + ADCQ $0x00, R14 + MOVQ BX, R8 + MOVQ CX, R9 + MOVQ R13, R10 + ANDQ $0x03, R10 + MOVQ R13, BX + ANDQ 
$-4, BX + ADDQ BX, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SHRQ $0x02, R14, R13 + SHRQ $0x02, R14 + ADDQ R13, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SUBQ $0x10, R15 + CMPQ R15, $0x10 + JAE loop bytes_between_0_and_15: TESTQ R15, R15 JZ done - MOVQ $1, BX + MOVQ $0x00000001, BX XORQ CX, CX XORQ R13, R13 ADDQ R15, SI flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX + SHLQ $0x08, BX, CX + SHLQ $0x08, BX MOVB -1(SI), R13 XORQ R13, BX DECQ SI DECQ R15 JNZ flush_buffer - ADDQ BX, R8 ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 + ADCQ $0x00, R10 + MOVQ $0x00000010, R15 JMP multiply done: - MOVQ R8, 0(DI) + MOVQ R8, (DI) MOVQ R9, 8(DI) MOVQ R10, 16(DI) RET