diff --git a/pqcrypto-internals/build.rs b/pqcrypto-internals/build.rs
index 3741e6b..112d096 100644
--- a/pqcrypto-internals/build.rs
+++ b/pqcrypto-internals/build.rs
@@ -17,6 +17,10 @@ fn main() {
         cfiledir.join("sp800-185.c"),
     ];
 
+    println!("cargo:rerun-if-changed=cfiles/");
+    println!("cargo:rerun-if-changed=build.rs");
+    println!("cargo:rerun-if-changed=src/");
+
     let mut build = cc::Build::new();
 
     let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap();
diff --git a/pqcrypto-internals/cfiles/keccak2x/feat.S b/pqcrypto-internals/cfiles/keccak2x/feat.S
index 35aa49b..da603e9 100644
--- a/pqcrypto-internals/cfiles/keccak2x/feat.S
+++ b/pqcrypto-internals/cfiles/keccak2x/feat.S
@@ -29,97 +29,97 @@ SOFTWARE.
 .macro round
     // Execute theta, but without xoring into the state yet.
     // Compute parities p[i] = a[i] ^ a[5+i] ^ ... ^ a[20+i].
-    eor3.16b v25, v0, v5, v10
-    eor3.16b v26, v1, v6, v11
-    eor3.16b v27, v2, v7, v12
-    eor3.16b v28, v3, v8, v13
-    eor3.16b v29, v4, v9, v14
-
-    eor3.16b v25, v25, v15, v20
-    eor3.16b v26, v26, v16, v21
-    eor3.16b v27, v27, v17, v22
-    eor3.16b v28, v28, v18, v23
-    eor3.16b v29, v29, v19, v24
-
-    rax1.2d v30, v29, v26 // d[0] = rotl(p[1], 1) ^ p[4]
-    rax1.2d v29, v27, v29 // d[3] = rotl(p[4], 1) ^ p[2]
-    rax1.2d v27, v25, v27 // d[1] = rotl(p[2], 1) ^ p[0]
-    rax1.2d v25, v28, v25 // d[4] = rotl(p[0], 1) ^ p[3]
-    rax1.2d v28, v26, v28 // d[2] = rotl(p[3], 1) ^ p[1]
+    eor3 v25.16b, v0.16b, v5.16b, v10.16b
+    eor3 v26.16b, v1.16b, v6.16b, v11.16b
+    eor3 v27.16b, v2.16b, v7.16b, v12.16b
+    eor3 v28.16b, v3.16b, v8.16b, v13.16b
+    eor3 v29.16b, v4.16b, v9.16b, v14.16b
+
+    eor3 v25.16b, v25.16b, v15.16b, v20.16b
+    eor3 v26.16b, v26.16b, v16.16b, v21.16b
+    eor3 v27.16b, v27.16b, v17.16b, v22.16b
+    eor3 v28.16b, v28.16b, v18.16b, v23.16b
+    eor3 v29.16b, v29.16b, v19.16b, v24.16b
+
+    rax1 v30.2d, v29.2d, v26.2d // d[0] = rotl(p[1], 1) ^ p[4]
+    rax1 v29.2d, v27.2d, v29.2d // d[3] = rotl(p[4], 1) ^ p[2]
+    rax1 v27.2d, v25.2d, v27.2d // d[1] = rotl(p[2], 1) ^ p[0]
+    rax1 v25.2d, v28.2d, v25.2d // d[4] = rotl(p[0], 1) ^ p[3]
+    rax1 v28.2d, v26.2d, v28.2d // d[2] = rotl(p[3], 1) ^ p[1]
 
     // Xor parities from step theta into the state at the same time
     // as executing rho and pi.
-    eor.16b v0, v0, v30
-    mov.16b v31, v1
-    xar.2d v1, v6, v27, 20
-    xar.2d v6, v9, v25, 44
-    xar.2d v9, v22, v28, 3
-    xar.2d v22, v14, v25, 25
-    xar.2d v14, v20, v30, 46
-    xar.2d v20, v2, v28, 2
-    xar.2d v2, v12, v28, 21
-    xar.2d v12, v13, v29, 39
-    xar.2d v13, v19, v25, 56
-    xar.2d v19, v23, v29, 8
-    xar.2d v23, v15, v30, 23
-    xar.2d v15, v4, v25, 37
-    xar.2d v4, v24, v25, 50
-    xar.2d v24, v21, v27, 62
-    xar.2d v21, v8, v29, 9
-    xar.2d v8, v16, v27, 19
-    xar.2d v16, v5, v30, 28
-    xar.2d v5, v3, v29, 36
-    xar.2d v3, v18, v29, 43
-    xar.2d v18, v17, v28, 49
-    xar.2d v17, v11, v27, 54
-    xar.2d v11, v7, v28, 58
-    xar.2d v7, v10, v30, 61
-    xar.2d v10, v31, v27, 63
+    eor v0.16b, v0.16b, v30.16b
+    mov v31.16b, v1.16b
+    xar v1.2d, v6.2d, v27.2d, 20
+    xar v6.2d, v9.2d, v25.2d, 44
+    xar v9.2d, v22.2d, v28.2d, 3
+    xar v22.2d, v14.2d, v25.2d, 25
+    xar v14.2d, v20.2d, v30.2d, 46
+    xar v20.2d, v2.2d, v28.2d, 2
+    xar v2.2d, v12.2d, v28.2d, 21
+    xar v12.2d, v13.2d, v29.2d, 39
+    xar v13.2d, v19.2d, v25.2d, 56
+    xar v19.2d, v23.2d, v29.2d, 8
+    xar v23.2d, v15.2d, v30.2d, 23
+    xar v15.2d, v4.2d, v25.2d, 37
+    xar v4.2d, v24.2d, v25.2d, 50
+    xar v24.2d, v21.2d, v27.2d, 62
+    xar v21.2d, v8.2d, v29.2d, 9
+    xar v8.2d, v16.2d, v27.2d, 19
+    xar v16.2d, v5.2d, v30.2d, 28
+    xar v5.2d, v3.2d, v29.2d, 36
+    xar v3.2d, v18.2d, v29.2d, 43
+    xar v18.2d, v17.2d, v28.2d, 49
+    xar v17.2d, v11.2d, v27.2d, 54
+    xar v11.2d, v7.2d, v28.2d, 58
+    xar v7.2d, v10.2d, v30.2d, 61
+    xar v10.2d, v31.2d, v27.2d, 63
 
     // Chi
-    bcax.16b v25, v0, v2, v1
-    bcax.16b v26, v1, v3, v2
-    bcax.16b v2, v2, v4, v3
-    bcax.16b v3, v3, v0, v4
-    bcax.16b v4, v4, v1, v0
-    mov.16b v0, v25
-    mov.16b v1, v26
-
-    bcax.16b v25, v5, v7, v6
-    bcax.16b v26, v6, v8, v7
-    bcax.16b v7, v7, v9, v8
-    bcax.16b v8, v8, v5, v9
-    bcax.16b v9, v9, v6, v5
-    mov.16b v5, v25
-    mov.16b v6, v26
-
-    bcax.16b v25, v10, v12, v11
-    bcax.16b v26, v11, v13, v12
-    bcax.16b v12, v12, v14, v13
-    bcax.16b v13, v13, v10, v14
-    bcax.16b v14, v14, v11, v10
-    mov.16b v10, v25
-    mov.16b v11, v26
-
-    bcax.16b v25, v15, v17, v16
-    bcax.16b v26, v16, v18, v17
-    bcax.16b v17, v17, v19, v18
-    bcax.16b v18, v18, v15, v19
-    bcax.16b v19, v19, v16, v15
-    mov.16b v15, v25
-    mov.16b v16, v26
-
-    bcax.16b v25, v20, v22, v21
-    bcax.16b v26, v21, v23, v22
-    bcax.16b v22, v22, v24, v23
-    bcax.16b v23, v23, v20, v24
-    bcax.16b v24, v24, v21, v20
-    mov.16b v20, v25
-    mov.16b v21, v26
+    bcax v25.16b, v0.16b, v2.16b, v1.16b
+    bcax v26.16b, v1.16b, v3.16b, v2.16b
+    bcax v2.16b, v2.16b, v4.16b, v3.16b
+    bcax v3.16b, v3.16b, v0.16b, v4.16b
+    bcax v4.16b, v4.16b, v1.16b, v0.16b
+    mov v0.16b, v25.16b
+    mov v1.16b, v26.16b
+
+    bcax v25.16b, v5.16b, v7.16b, v6.16b
+    bcax v26.16b, v6.16b, v8.16b, v7.16b
+    bcax v7.16b, v7.16b, v9.16b, v8.16b
+    bcax v8.16b, v8.16b, v5.16b, v9.16b
+    bcax v9.16b, v9.16b, v6.16b, v5.16b
+    mov v5.16b, v25.16b
+    mov v6.16b, v26.16b
+
+    bcax v25.16b, v10.16b, v12.16b, v11.16b
+    bcax v26.16b, v11.16b, v13.16b, v12.16b
+    bcax v12.16b, v12.16b, v14.16b, v13.16b
+    bcax v13.16b, v13.16b, v10.16b, v14.16b
+    bcax v14.16b, v14.16b, v11.16b, v10.16b
+    mov v10.16b, v25.16b
+    mov v11.16b, v26.16b
+
+    bcax v25.16b, v15.16b, v17.16b, v16.16b
+    bcax v26.16b, v16.16b, v18.16b, v17.16b
+    bcax v17.16b, v17.16b, v19.16b, v18.16b
+    bcax v18.16b, v18.16b, v15.16b, v19.16b
+    bcax v19.16b, v19.16b, v16.16b, v15.16b
+    mov v15.16b, v25.16b
+    mov v16.16b, v26.16b
+
+    bcax v25.16b, v20.16b, v22.16b, v21.16b
+    bcax v26.16b, v21.16b, v23.16b, v22.16b
+    bcax v22.16b, v22.16b, v24.16b, v23.16b
+    bcax v23.16b, v23.16b, v20.16b, v24.16b
+    bcax v24.16b, v24.16b, v21.16b, v20.16b
+    mov v20.16b, v25.16b
+    mov v21.16b, v26.16b
 
     // iota
     ld1r {v25.2d}, [x1], #8
-    eor.16b v0, v0, v25
+    eor v0.16b, v0.16b, v25.16b
 .endm
 
 .align 4
@@ -135,13 +135,13 @@ _f1600x2:
     mov x2, x0
     mov x3, #24
 
-    ld1.2d {v0, v1, v2, v3}, [x0], #64
-    ld1.2d {v4, v5, v6, v7}, [x0], #64
-    ld1.2d {v8, v9, v10, v11}, [x0], #64
-    ld1.2d {v12, v13, v14, v15}, [x0], #64
-    ld1.2d {v16, v17, v18, v19}, [x0], #64
-    ld1.2d {v20, v21, v22, v23}, [x0], #64
-    ld1.2d {v24}, [x0]
+    ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+    ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64
+    ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64
+    ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    ld1 {v24.2d}, [x0]
 
 loop:
     round
@@ -150,13 +150,13 @@ loop:
     cbnz x3, loop
 
     mov x0, x2
-    st1.2d {v0, v1, v2, v3}, [x0], #64
-    st1.2d {v4, v5, v6, v7}, [x0], #64
-    st1.2d {v8, v9, v10, v11}, [x0], #64
-    st1.2d {v12, v13, v14, v15}, [x0], #64
-    st1.2d {v16, v17, v18, v19}, [x0], #64
-    st1.2d {v20, v21, v22, v23}, [x0], #64
-    st1.2d {v24}, [x0]
+    st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64
+    st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64
+    st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64
+    st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64
+    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64
+    st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64
+    st1 {v24.2d}, [x0]
 
     ldp d14, d15, [sp], #16
     ldp d12, d13, [sp], #16