diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 93ea4de0c2..5406895f26 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -180,7 +180,7 @@ endif() if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if((OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS))) -if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0"))) +if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((NOT (CMAKE_C_COMPILER_ID STREQUAL "GNU")) AND (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))) cmake_dependent_option(OQS_ENABLE_KEM_kyber_512_aarch64 "" ON "OQS_ENABLE_KEM_kyber_512" OFF) endif() endif() @@ -195,7 +195,7 @@ endif() if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if((OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS))) -if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0"))) +if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((NOT (CMAKE_C_COMPILER_ID STREQUAL "GNU")) AND (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))) cmake_dependent_option(OQS_ENABLE_KEM_kyber_768_aarch64 "" ON "OQS_ENABLE_KEM_kyber_768" OFF) endif() endif() @@ -210,7 +210,7 @@ endif() if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if((OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS))) -if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0"))) +if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((NOT (CMAKE_C_COMPILER_ID STREQUAL "GNU")) AND (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))) cmake_dependent_option(OQS_ENABLE_KEM_kyber_1024_aarch64 "" ON "OQS_ENABLE_KEM_kyber_1024" OFF) endif() endif() @@ -336,6 +336,14 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if((OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS))) +if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((NOT (CMAKE_C_COMPILER_ID STREQUAL "GNU")) AND (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))) + cmake_dependent_option(OQS_ENABLE_KEM_saber_lightsaber_aarch64 "" ON "OQS_ENABLE_KEM_saber_lightsaber" OFF) +endif() +endif() +endif() + cmake_dependent_option(OQS_ENABLE_KEM_saber_saber "" ON "OQS_ENABLE_KEM_SABER" OFF) if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) @@ -343,6 +351,14 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if((OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS))) +if(((CMAKE_C_COMPILER_ID STREQUAL 
"GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((NOT (CMAKE_C_COMPILER_ID STREQUAL "GNU")) AND (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))) + cmake_dependent_option(OQS_ENABLE_KEM_saber_saber_aarch64 "" ON "OQS_ENABLE_KEM_saber_saber" OFF) +endif() +endif() +endif() + cmake_dependent_option(OQS_ENABLE_KEM_saber_firesaber "" ON "OQS_ENABLE_KEM_SABER" OFF) if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) @@ -350,6 +366,14 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if((OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS))) +if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((NOT (CMAKE_C_COMPILER_ID STREQUAL "GNU")) AND (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))) + cmake_dependent_option(OQS_ENABLE_KEM_saber_firesaber_aarch64 "" ON "OQS_ENABLE_KEM_saber_firesaber" OFF) +endif() +endif() +endif() + option(OQS_ENABLE_SIG_DILITHIUM "Enable dilithium algorithm family" ON) cmake_dependent_option(OQS_ENABLE_SIG_dilithium_2 "" ON "OQS_ENABLE_SIG_DILITHIUM" OFF) diff --git a/docs/algorithms/kem/classic_mceliece.md b/docs/algorithms/kem/classic_mceliece.md index af7ecee8a0..fefa2ed7f9 100644 --- a/docs/algorithms/kem/classic_mceliece.md +++ b/docs/algorithms/kem/classic_mceliece.md @@ -6,7 +6,7 @@ - **Authors' website**: https://classic.mceliece.org - **Specification version**: SUPERCOP-20191221. - **Primary Source**: - - **Source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + - **Source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 - **Implementation license (SPDX-Identifier)**: Public domain , which takes it from: - SUPERCOP-20191221 "vec" and "avx" implementations diff --git a/docs/algorithms/kem/classic_mceliece.yml b/docs/algorithms/kem/classic_mceliece.yml index fe66c10541..e1835b9826 100644 --- a/docs/algorithms/kem/classic_mceliece.yml +++ b/docs/algorithms/kem/classic_mceliece.yml @@ -369,4 +369,4 @@ parameter-sets: auxiliary-submitters: [] primary-upstream: spdx-license-identifier: Public domain - source: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + source: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 diff --git a/docs/algorithms/kem/hqc.md b/docs/algorithms/kem/hqc.md index 5d365a59dc..bc6a1ef6d8 100644 --- a/docs/algorithms/kem/hqc.md +++ b/docs/algorithms/kem/hqc.md @@ -6,7 +6,7 @@ - **Authors' website**: https://pqc-hqc.org/ - **Specification version**: NIST Round 3 submission. 
- **Primary Source**: - - **Source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + - **Source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 - **Implementation license (SPDX-Identifier)**: Public domain , which takes it from: - https://github.com/jschanck/package-pqclean/tree/29f79e72/hqc, which takes it from: diff --git a/docs/algorithms/kem/hqc.yml b/docs/algorithms/kem/hqc.yml index 521a2b11f9..ad3d1b9d02 100644 --- a/docs/algorithms/kem/hqc.yml +++ b/docs/algorithms/kem/hqc.yml @@ -122,4 +122,4 @@ parameter-sets: upstream: primary-upstream primary-upstream: spdx-license-identifier: Public domain - source: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + source: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 diff --git a/docs/algorithms/kem/kyber.md b/docs/algorithms/kem/kyber.md index df02fb7598..5eb3456852 100644 --- a/docs/algorithms/kem/kyber.md +++ b/docs/algorithms/kem/kyber.md @@ -11,7 +11,7 @@ - **Implementation license (SPDX-Identifier)**: CC0-1.0 - **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/faf5c3fe33e0b61c7c8a7888dd862bf5def17ad2 with copy_from_upstream patches - **pqclean-aarch64**: - - **Source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 with copy_from_upstream patches + - **Source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 with copy_from_upstream patches - **Implementation license (SPDX-Identifier)**: CC0-1.0 diff --git a/docs/algorithms/kem/kyber.yml b/docs/algorithms/kem/kyber.yml index ae6701ec1e..052618abe3 100644 --- a/docs/algorithms/kem/kyber.yml +++ b/docs/algorithms/kem/kyber.yml @@ -22,7 +22,7 @@ primary-upstream: spdx-license-identifier: CC0-1.0 optimized-upstreams: pqclean-aarch64: - source: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + source: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 with copy_from_upstream patches spdx-license-identifier: CC0-1.0 parameter-sets: diff --git a/docs/algorithms/kem/ntru.md b/docs/algorithms/kem/ntru.md index 4d15a45c4a..b4943d8a4b 100644 --- a/docs/algorithms/kem/ntru.md +++ b/docs/algorithms/kem/ntru.md @@ -7,7 +7,7 @@ - **Authors' website**: https://ntru.org/ - **Specification version**: NIST Round 3 submission. 
- **Primary Source**: - - **Source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + - **Source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 - **Implementation license (SPDX-Identifier)**: CC0-1.0 , which takes it from: - https://github.com/jschanck/ntru/tree/a43a4457 diff --git a/docs/algorithms/kem/ntru.yml b/docs/algorithms/kem/ntru.yml index cab84a55e8..8ee50e55ee 100644 --- a/docs/algorithms/kem/ntru.yml +++ b/docs/algorithms/kem/ntru.yml @@ -185,4 +185,4 @@ parameter-sets: upstream: primary-upstream primary-upstream: spdx-license-identifier: CC0-1.0 - source: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + source: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 diff --git a/docs/algorithms/kem/ntruprime.md b/docs/algorithms/kem/ntruprime.md index 2e0f1f1c38..d89d4f15df 100644 --- a/docs/algorithms/kem/ntruprime.md +++ b/docs/algorithms/kem/ntruprime.md @@ -6,7 +6,7 @@ - **Authors' website**: https://ntruprime.cr.yp.to - **Specification version**: supercop-20200826. - **Primary Source**: - - **Source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + - **Source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 - **Implementation license (SPDX-Identifier)**: Public domain , which takes it from: - https://github.com/jschanck/package-pqclean/tree/4d9f08c3/ntruprime, which takes it from: diff --git a/docs/algorithms/kem/ntruprime.yml b/docs/algorithms/kem/ntruprime.yml index 9bd9114065..41c9b7017f 100644 --- a/docs/algorithms/kem/ntruprime.yml +++ b/docs/algorithms/kem/ntruprime.yml @@ -285,4 +285,4 @@ parameter-sets: upstream: primary-upstream primary-upstream: spdx-license-identifier: Public domain - source: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + source: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 diff --git a/docs/algorithms/kem/saber.md b/docs/algorithms/kem/saber.md index b69ed54ce1..022a5c58a5 100644 --- a/docs/algorithms/kem/saber.md +++ b/docs/algorithms/kem/saber.md @@ -6,7 +6,7 @@ - **Authors' website**: https://www.esat.kuleuven.be/cosic/pqcrypto/saber/ - **Specification version**: NIST Round 3 submission. - **Primary Source**: - - **Source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + - **Source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 with copy_from_upstream patches - **Implementation license (SPDX-Identifier)**: Public domain , which takes it from: - https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber, which takes it from: @@ -26,6 +26,7 @@ |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------| | [Primary Source](#primary-source) | clean | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2 | False | True | False | +| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -37,6 +38,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. 
|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | clean | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2 | False | True | False | +| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -46,6 +48,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | clean | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2 | False | True | False | +| [Primary Source](#primary-source) | aarch64 | ARM64\_V8 | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. diff --git a/docs/algorithms/kem/saber.yml b/docs/algorithms/kem/saber.yml index d2b6b49794..49caf97d3d 100644 --- a/docs/algorithms/kem/saber.yml +++ b/docs/algorithms/kem/saber.yml @@ -44,6 +44,18 @@ parameter-sets: no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false upstream: primary-upstream + - upstream-id: aarch64 + supported-platforms: + - architecture: ARM64_V8 + operating_systems: + - Linux + - Darwin + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false + upstream: primary-upstream - name: Saber-KEM claimed-nist-level: 3 claimed-security: IND-CCA2 @@ -75,6 +87,18 @@ parameter-sets: no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false upstream: primary-upstream + - upstream-id: aarch64 + supported-platforms: + - architecture: ARM64_V8 + operating_systems: + - Linux + - Darwin + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false + upstream: primary-upstream - name: FireSaber-KEM claimed-nist-level: 5 claimed-security: IND-CCA2 @@ -106,6 +130,19 @@ parameter-sets: no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false upstream: primary-upstream + - upstream-id: aarch64 + supported-platforms: + - architecture: ARM64_V8 + operating_systems: + - Linux + - Darwin + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false + upstream: primary-upstream primary-upstream: spdx-license-identifier: Public domain - source: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 + source: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 + with copy_from_upstream patches diff --git a/docs/algorithms/sig/falcon.md b/docs/algorithms/sig/falcon.md index 9e13a2fab0..ffe014723c 100644 --- a/docs/algorithms/sig/falcon.md +++ b/docs/algorithms/sig/falcon.md @@ -6,7 +6,7 @@ - **Auxiliary 
submitters**: Pierre-Alain Fouque, Jeffrey Hoffstein, Paul Kirchner, Vadim Lyubashevsky, Thomas Pornin, Thomas Ricosset, Gregor Seiler, William Whyte, Zhenfei Zhang.
 - **Authors' website**: https://falcon-sign.info
 - **Specification version**: v1.2.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16, which takes it from:
   - https://github.com/jschanck/package-pqclean/tree/cea1fa5a/falcon, which takes it from:
     - supercop-20201018
 - **Implementation license (SPDX-Identifier)**: CC0-1.0.
diff --git a/docs/algorithms/sig/falcon.yml b/docs/algorithms/sig/falcon.yml
index 9ba9917909..a85201f11f 100644
--- a/docs/algorithms/sig/falcon.yml
+++ b/docs/algorithms/sig/falcon.yml
@@ -17,7 +17,7 @@ website: https://falcon-sign.info
 nist-round: 3
 spec-version: v1.2
 spdx-license-identifier: CC0-1.0
-upstream: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9
+upstream: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16
 upstream-ancestors:
 - https://github.com/jschanck/package-pqclean/tree/cea1fa5a/falcon
 - supercop-20201018
diff --git a/docs/algorithms/sig/rainbow.md b/docs/algorithms/sig/rainbow.md
index c59e917af0..13ee7e8c76 100644
--- a/docs/algorithms/sig/rainbow.md
+++ b/docs/algorithms/sig/rainbow.md
@@ -6,7 +6,7 @@
 - **Auxiliary submitters**: Ming-Shing Chen, Matthias Kannwischer, Jacques Patarin, Albrecht Petzoldt, Dieter Schmidt, Bo-Yin Yang.
 - **Authors' website**: https://www.pqcrainbow.org/
 - **Specification version**: NIST Round 3 submission.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9, which takes it from:
+- **Implementation source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16, which takes it from:
   - https://github.com/fast-crypto-lab/rainbow-submission-round2/commit/173ada0e077e1b9dbd8e4a78994f87acc0c92263
 - **Implementation license (SPDX-Identifier)**: CC0-1.0.
diff --git a/docs/algorithms/sig/rainbow.yml b/docs/algorithms/sig/rainbow.yml
index e009679e2a..67342ceb4b 100644
--- a/docs/algorithms/sig/rainbow.yml
+++ b/docs/algorithms/sig/rainbow.yml
@@ -14,7 +14,7 @@ website: https://www.pqcrainbow.org/
 nist-round: 3
 spec-version: NIST Round 3 submission
 spdx-license-identifier: CC0-1.0
-upstream: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9
+upstream: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16
 upstream-ancestors:
 - https://github.com/fast-crypto-lab/rainbow-submission-round2/commit/173ada0e077e1b9dbd8e4a78994f87acc0c92263
 parameter-sets:
diff --git a/docs/algorithms/sig/sphincs.md b/docs/algorithms/sig/sphincs.md
index c3caafcc53..a3efd95bd4 100644
--- a/docs/algorithms/sig/sphincs.md
+++ b/docs/algorithms/sig/sphincs.md
@@ -6,7 +6,7 @@
 - **Auxiliary submitters**: Jean-Philippe Aumasson, Daniel J. Bernstein, Christoph Dobraunig, Maria Eichlseder, Scott Fluhrer, Stefan-Lukas Gazdag, Panos Kampanakis, Stefan Kölbl, Tanja Lange, Martin M. Lauridsen, Florian Mendel, Ruben Niederhagen, Christian Rechberger, Joost Rijneveld, Peter Schwabe.
 - **Authors' website**: https://sphincs.org/
 - **Specification version**: NIST Round 3 submission.
-- **Implementation source**: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 with copy_from_upstream patches, which takes it from: +- **Implementation source**: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 with copy_from_upstream patches, which takes it from: - https://github.com/sphincs/sphincsplus - **Implementation license (SPDX-Identifier)**: CC0-1.0. diff --git a/docs/algorithms/sig/sphincs.yml b/docs/algorithms/sig/sphincs.yml index fe7f1620c6..a32f83b3f5 100644 --- a/docs/algorithms/sig/sphincs.yml +++ b/docs/algorithms/sig/sphincs.yml @@ -23,7 +23,7 @@ website: https://sphincs.org/ nist-round: 3 spec-version: NIST Round 3 submission spdx-license-identifier: CC0-1.0 -upstream: https://github.com/PQClean/PQClean/commit/f365dcfe69f15325443ce65df9798e42816f78e9 +upstream: https://github.com/PQClean/PQClean/commit/6a32796212b79a5f9126d0a933e1216313f50c16 with copy_from_upstream patches upstream-ancestors: - https://github.com/sphincs/sphincsplus diff --git a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg.fragment b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg.fragment index c318262b75..a90de13787 100644 --- a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg.fragment +++ b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg.fragment @@ -16,11 +16,11 @@ endif() {% if platform['operating_systems'] %}if(CMAKE_SYSTEM_NAME MATCHES "{{ platform['operating_systems']|join('|') }}") {% endif -%} if((OQS_DIST_ARM64_V8_BUILD OR (OQS_USE_ARM_NEON_INSTRUCTIONS{% for flag in platform['required_flags'] -%} {%- if not loop.last or loop.first %} AND {% endif -%}OQS_USE_ARM_{{ flag|upper }}_INSTRUCTIONS {%- endfor -%}))) -{% if family['name'] == "kyber" and impl['upstream']['name'] == 'pqclean' -%} -if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0"))) +{% if impl['upstream']['name'] == 'pqclean' -%} +if(((CMAKE_C_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "9.4.0")) OR ((NOT (CMAKE_C_COMPILER_ID STREQUAL "GNU")) AND (NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))) {%- endif %} cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}_{{ impl['name'] }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}" OFF) -{% if family['name'] == "kyber" and impl['upstream']['name'] == 'pqclean' -%} +{% if impl['upstream']['name'] == 'pqclean' -%} endif() {%- endif %} endif() diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index c55502da90..5bffac6525 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -3,13 +3,12 @@ upstreams: name: pqclean git_url: https://github.com/PQClean/PQClean.git git_branch: master - git_commit: f365dcfe69f15325443ce65df9798e42816f78e9 + git_commit: 6a32796212b79a5f9126d0a933e1216313f50c16 kem_meta_path: 'crypto_kem/{pqclean_scheme}/META.yml' sig_meta_path: 'crypto_sign/{pqclean_scheme}/META.yml' kem_scheme_path: 'crypto_kem/{pqclean_scheme}' sig_scheme_path: 'crypto_sign/{pqclean_scheme}' - ignore: ['pqclean_lightsaber_aarch64', 'pqclean_saber_aarch64', 
'pqclean_firesaber_aarch64']
-  patches: ['pqclean-sphincs.patch', 'pqclean-kyber-armneon-shake.patch']
+  patches: ['pqclean-sphincs.patch', 'pqclean-kyber-armneon-shake.patch', 'pqclean-saber-armneon-shake.patch']
   - name: pqcrystals-kyber
     git_url: https://github.com/pq-crystals/kyber.git
diff --git a/scripts/copy_from_upstream/patches/pqclean-saber-armneon-shake.patch b/scripts/copy_from_upstream/patches/pqclean-saber-armneon-shake.patch
new file mode 100644
index 0000000000..d96a4c15c1
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqclean-saber-armneon-shake.patch
@@ -0,0 +1,63 @@
+diff --git a/crypto_kem/firesaber/aarch64/fips202x2.h b/crypto_kem/firesaber/aarch64/fips202x2.h
+index d260245..11579f3 100644
+--- a/crypto_kem/firesaber/aarch64/fips202x2.h
++++ b/crypto_kem/firesaber/aarch64/fips202x2.h
+@@ -4,15 +4,9 @@
+ #include "SABER_params.h"
+ #include <arm_neon.h>
+ #include <stdint.h>
+-
++#include "fips202.h"
+ typedef uint64x2_t v128;
+ 
+-#define SHAKE128_RATE 168
+-#define SHAKE256_RATE 136
+-#define SHA3_256_RATE 136
+-#define SHA3_512_RATE 72
+-
+-
+ typedef struct {
+     v128 s[25];
+ } keccakx2_state;
+diff --git a/crypto_kem/lightsaber/aarch64/fips202x2.h b/crypto_kem/lightsaber/aarch64/fips202x2.h
+index d260245..11579f3 100644
+--- a/crypto_kem/lightsaber/aarch64/fips202x2.h
++++ b/crypto_kem/lightsaber/aarch64/fips202x2.h
+@@ -4,15 +4,9 @@
+ #include "SABER_params.h"
+ #include <arm_neon.h>
+ #include <stdint.h>
+-
++#include "fips202.h"
+ typedef uint64x2_t v128;
+ 
+-#define SHAKE128_RATE 168
+-#define SHAKE256_RATE 136
+-#define SHA3_256_RATE 136
+-#define SHA3_512_RATE 72
+-
+-
+ typedef struct {
+     v128 s[25];
+ } keccakx2_state;
+diff --git a/crypto_kem/saber/aarch64/fips202x2.h b/crypto_kem/saber/aarch64/fips202x2.h
+index d260245..11579f3 100644
+--- a/crypto_kem/saber/aarch64/fips202x2.h
++++ b/crypto_kem/saber/aarch64/fips202x2.h
+@@ -4,15 +4,9 @@
+ #include "SABER_params.h"
+ #include <arm_neon.h>
+ #include <stdint.h>
+-
++#include "fips202.h"
+ typedef uint64x2_t v128;
+ 
+-#define SHAKE128_RATE 168
+-#define SHAKE256_RATE 136
+-#define SHA3_256_RATE 136
+-#define SHA3_512_RATE 72
+-
+-
+ typedef struct {
+     v128 s[25];
+ } keccakx2_state;
diff --git a/src/kem/saber/CMakeLists.txt b/src/kem/saber/CMakeLists.txt
index 87e8f0a8ac..34a5502fd5 100644
--- a/src/kem/saber/CMakeLists.txt
+++ b/src/kem/saber/CMakeLists.txt
@@ -20,6 +20,13 @@ if(OQS_ENABLE_KEM_saber_lightsaber_avx2)
     set(_SABER_OBJS ${_SABER_OBJS} $<TARGET_OBJECTS:saber_lightsaber_avx2>)
 endif()
 
+if(OQS_ENABLE_KEM_saber_lightsaber_aarch64)
+    add_library(saber_lightsaber_aarch64 OBJECT pqclean_lightsaber_aarch64/__asm_iNTT.S pqclean_lightsaber_aarch64/__asm_mul.S pqclean_lightsaber_aarch64/__asm_narrow.S pqclean_lightsaber_aarch64/__asm_NTT.S pqclean_lightsaber_aarch64/__asm_pack_unpack.S pqclean_lightsaber_aarch64/cbd.c pqclean_lightsaber_aarch64/fips202x2.c pqclean_lightsaber_aarch64/kem.c pqclean_lightsaber_aarch64/pack_unpack.c pqclean_lightsaber_aarch64/SABER_indcpa.c pqclean_lightsaber_aarch64/verify.c)
+    target_include_directories(saber_lightsaber_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_lightsaber_aarch64)
+    target_include_directories(saber_lightsaber_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    set(_SABER_OBJS ${_SABER_OBJS} $<TARGET_OBJECTS:saber_lightsaber_aarch64>)
+endif()
+
 if(OQS_ENABLE_KEM_saber_saber)
     add_library(saber_saber_clean OBJECT kem_saber_saber.c pqclean_saber_clean/cbd.c pqclean_saber_clean/kem.c pqclean_saber_clean/pack_unpack.c pqclean_saber_clean/poly.c pqclean_saber_clean/poly_mul.c pqclean_saber_clean/SABER_indcpa.c pqclean_saber_clean/verify.c)
     target_include_directories(saber_saber_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_saber_clean)
@@ -35,6 +42,13 @@ if(OQS_ENABLE_KEM_saber_saber_avx2)
     set(_SABER_OBJS ${_SABER_OBJS} $<TARGET_OBJECTS:saber_saber_avx2>)
 endif()
 
+if(OQS_ENABLE_KEM_saber_saber_aarch64)
+    add_library(saber_saber_aarch64 OBJECT pqclean_saber_aarch64/__asm_iNTT.S pqclean_saber_aarch64/__asm_mul.S pqclean_saber_aarch64/__asm_narrow.S pqclean_saber_aarch64/__asm_NTT.S pqclean_saber_aarch64/__asm_pack_unpack.S pqclean_saber_aarch64/cbd.c pqclean_saber_aarch64/fips202x2.c pqclean_saber_aarch64/kem.c pqclean_saber_aarch64/pack_unpack.c pqclean_saber_aarch64/SABER_indcpa.c pqclean_saber_aarch64/verify.c)
+    target_include_directories(saber_saber_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_saber_aarch64)
+    target_include_directories(saber_saber_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    set(_SABER_OBJS ${_SABER_OBJS} $<TARGET_OBJECTS:saber_saber_aarch64>)
+endif()
+
 if(OQS_ENABLE_KEM_saber_firesaber)
     add_library(saber_firesaber_clean OBJECT kem_saber_firesaber.c pqclean_firesaber_clean/cbd.c pqclean_firesaber_clean/kem.c pqclean_firesaber_clean/pack_unpack.c pqclean_firesaber_clean/poly.c pqclean_firesaber_clean/poly_mul.c pqclean_firesaber_clean/SABER_indcpa.c pqclean_firesaber_clean/verify.c)
     target_include_directories(saber_firesaber_clean PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_firesaber_clean)
@@ -50,4 +64,11 @@ if(OQS_ENABLE_KEM_saber_firesaber_avx2)
     set(_SABER_OBJS ${_SABER_OBJS} $<TARGET_OBJECTS:saber_firesaber_avx2>)
 endif()
 
+if(OQS_ENABLE_KEM_saber_firesaber_aarch64)
+    add_library(saber_firesaber_aarch64 OBJECT pqclean_firesaber_aarch64/__asm_iNTT.S pqclean_firesaber_aarch64/__asm_mul.S pqclean_firesaber_aarch64/__asm_narrow.S pqclean_firesaber_aarch64/__asm_NTT.S pqclean_firesaber_aarch64/__asm_pack_unpack.S pqclean_firesaber_aarch64/cbd.c pqclean_firesaber_aarch64/fips202x2.c pqclean_firesaber_aarch64/kem.c pqclean_firesaber_aarch64/pack_unpack.c pqclean_firesaber_aarch64/SABER_indcpa.c pqclean_firesaber_aarch64/verify.c)
+    target_include_directories(saber_firesaber_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqclean_firesaber_aarch64)
+    target_include_directories(saber_firesaber_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    set(_SABER_OBJS ${_SABER_OBJS} $<TARGET_OBJECTS:saber_firesaber_aarch64>)
+endif()
+
 set(SABER_OBJS ${_SABER_OBJS} PARENT_SCOPE)
diff --git a/src/kem/saber/kem_saber_firesaber.c b/src/kem/saber/kem_saber_firesaber.c
index 679b13caf4..b1ede4b0dc 100644
--- a/src/kem/saber/kem_saber_firesaber.c
+++ b/src/kem/saber/kem_saber_firesaber.c
@@ -40,6 +40,12 @@ extern int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const
 extern int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 #endif
 
+#if defined(OQS_ENABLE_KEM_saber_firesaber_aarch64)
+extern int PQCLEAN_FIRESABER_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
+extern int PQCLEAN_FIRESABER_AARCH64_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int PQCLEAN_FIRESABER_AARCH64_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#endif
+
 OQS_API OQS_STATUS OQS_KEM_saber_firesaber_keypair(uint8_t *public_key, uint8_t *secret_key) {
 #if defined(OQS_ENABLE_KEM_saber_firesaber_avx2)
 #if defined(OQS_DIST_BUILD)
@@ -51,6 +57,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_firesaber_keypair(uint8_t *public_key, uint8_t
 		return (OQS_STATUS) PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(public_key, secret_key);
 	}
 #endif /* OQS_DIST_BUILD */
+#elif defined(OQS_ENABLE_KEM_saber_firesaber_aarch64)
+	return (OQS_STATUS)
PQCLEAN_FIRESABER_AARCH64_crypto_kem_keypair(public_key, secret_key); #else return (OQS_STATUS) PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(public_key, secret_key); #endif @@ -67,6 +75,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_firesaber_encaps(uint8_t *ciphertext, uint8_t * return (OQS_STATUS) PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_firesaber_aarch64) + return (OQS_STATUS) PQCLEAN_FIRESABER_AARCH64_crypto_kem_enc(ciphertext, shared_secret, public_key); #else return (OQS_STATUS) PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); #endif @@ -83,6 +93,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_firesaber_decaps(uint8_t *shared_secret, const return (OQS_STATUS) PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(shared_secret, ciphertext, secret_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_firesaber_aarch64) + return (OQS_STATUS) PQCLEAN_FIRESABER_AARCH64_crypto_kem_dec(shared_secret, ciphertext, secret_key); #else return (OQS_STATUS) PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(shared_secret, ciphertext, secret_key); #endif diff --git a/src/kem/saber/kem_saber_lightsaber.c b/src/kem/saber/kem_saber_lightsaber.c index b19a6ee4ca..8dfe721a2b 100644 --- a/src/kem/saber/kem_saber_lightsaber.c +++ b/src/kem/saber/kem_saber_lightsaber.c @@ -40,6 +40,12 @@ extern int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, cons extern int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_ENABLE_KEM_saber_lightsaber_aarch64) +extern int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +extern int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_saber_lightsaber_keypair(uint8_t *public_key, uint8_t *secret_key) { #if defined(OQS_ENABLE_KEM_saber_lightsaber_avx2) #if defined(OQS_DIST_BUILD) @@ -51,6 +57,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_lightsaber_keypair(uint8_t *public_key, uint8_t return (OQS_STATUS) PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(public_key, secret_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_lightsaber_aarch64) + return (OQS_STATUS) PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_keypair(public_key, secret_key); #else return (OQS_STATUS) PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(public_key, secret_key); #endif @@ -67,6 +75,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_lightsaber_encaps(uint8_t *ciphertext, uint8_t return (OQS_STATUS) PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_lightsaber_aarch64) + return (OQS_STATUS) PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_enc(ciphertext, shared_secret, public_key); #else return (OQS_STATUS) PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); #endif @@ -83,6 +93,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_lightsaber_decaps(uint8_t *shared_secret, const return (OQS_STATUS) PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(shared_secret, ciphertext, secret_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_lightsaber_aarch64) + return (OQS_STATUS) PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_dec(shared_secret, ciphertext, secret_key); #else return (OQS_STATUS) PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(shared_secret, 
ciphertext, secret_key); #endif diff --git a/src/kem/saber/kem_saber_saber.c b/src/kem/saber/kem_saber_saber.c index 5934367896..7a72274451 100644 --- a/src/kem/saber/kem_saber_saber.c +++ b/src/kem/saber/kem_saber_saber.c @@ -40,6 +40,12 @@ extern int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uin extern int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_ENABLE_KEM_saber_saber_aarch64) +extern int PQCLEAN_SABER_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +extern int PQCLEAN_SABER_AARCH64_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int PQCLEAN_SABER_AARCH64_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif + OQS_API OQS_STATUS OQS_KEM_saber_saber_keypair(uint8_t *public_key, uint8_t *secret_key) { #if defined(OQS_ENABLE_KEM_saber_saber_avx2) #if defined(OQS_DIST_BUILD) @@ -51,6 +57,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_saber_keypair(uint8_t *public_key, uint8_t *sec return (OQS_STATUS) PQCLEAN_SABER_CLEAN_crypto_kem_keypair(public_key, secret_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_saber_aarch64) + return (OQS_STATUS) PQCLEAN_SABER_AARCH64_crypto_kem_keypair(public_key, secret_key); #else return (OQS_STATUS) PQCLEAN_SABER_CLEAN_crypto_kem_keypair(public_key, secret_key); #endif @@ -67,6 +75,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_saber_encaps(uint8_t *ciphertext, uint8_t *shar return (OQS_STATUS) PQCLEAN_SABER_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_saber_aarch64) + return (OQS_STATUS) PQCLEAN_SABER_AARCH64_crypto_kem_enc(ciphertext, shared_secret, public_key); #else return (OQS_STATUS) PQCLEAN_SABER_CLEAN_crypto_kem_enc(ciphertext, shared_secret, public_key); #endif @@ -83,6 +93,8 @@ OQS_API OQS_STATUS OQS_KEM_saber_saber_decaps(uint8_t *shared_secret, const uint return (OQS_STATUS) PQCLEAN_SABER_CLEAN_crypto_kem_dec(shared_secret, ciphertext, secret_key); } #endif /* OQS_DIST_BUILD */ +#elif defined(OQS_ENABLE_KEM_saber_saber_aarch64) + return (OQS_STATUS) PQCLEAN_SABER_AARCH64_crypto_kem_dec(shared_secret, ciphertext, secret_key); #else return (OQS_STATUS) PQCLEAN_SABER_CLEAN_crypto_kem_dec(shared_secret, ciphertext, secret_key); #endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/LICENSE b/src/kem/saber/pqclean_firesaber_aarch64/LICENSE new file mode 100644 index 0000000000..0e259d42c9 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). 
+ +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. 
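
The NTT code vendored below is worth a note on what it computes. Saber does all polynomial arithmetic in R_q = Z_q[x]/(x^256 + 1) with q = 2^13; a power-of-two modulus has no roots of unity to build an NTT from, so the aarch64 implementation widens coefficients to 32 bits (the asm_13_to_32/asm_16_to_32 helpers used in SABER_indcpa.c), runs NTTs modulo the prime Q1 = 25570817 from NTT_params.h, and rounds back mod 2^13. The following is a minimal reference sketch of the negacyclic product the NTT path must reproduce, assuming only the ring parameters; it is not code from this patch:

```c
/* Reference negacyclic multiplication in Z_{2^13}[x]/(x^256 + 1).
 * Illustrative cross-check only, not part of this patch. */
#include <stdint.h>

#define N 256
#define Q (1 << 13) /* Saber works mod q = 2^13 (SABER_EQ = 13) */

static void poly_mul_ref(uint16_t des[N], const uint16_t a[N], const uint16_t b[N]) {
    /* uint32_t wraparound is harmless here: the result is only needed
     * mod 2^13, and 2^13 divides 2^32. */
    uint32_t acc[2 * N] = {0};
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            acc[i + j] += (uint32_t) a[i] * b[j];
        }
    }
    for (int i = 0; i < N; i++) {
        /* x^256 = -1, so the upper half wraps around negated */
        des[i] = (uint16_t) ((acc[i] - acc[i + N]) & (Q - 1));
    }
}
```

Running the vendored NTT path and this reference on random inputs and comparing results mod 2^13 is a quick way to sanity-check the assembly on a new toolchain.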
diff --git a/src/kem/saber/pqclean_firesaber_aarch64/NTT.h b/src/kem/saber/pqclean_firesaber_aarch64/NTT.h new file mode 100644 index 0000000000..9af095af60 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/NTT.h @@ -0,0 +1,50 @@ +#ifndef NTT_H +#define NTT_H + +#include + +#include "NTT_params.h" + +extern void PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_top(uint32_t *des, const uint32_t *table, const uint32_t *_constants); +extern void PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_bot(uint32_t *des, const uint32_t *table, const uint32_t *_constants); +extern void PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_top(uint32_t *des, const uint32_t *table, const uint32_t *_constants); +extern void PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_bot(uint32_t *des, const uint32_t *table, const uint32_t *_constants, const uint32_t *_inv_twist_const); +extern void PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul(uint32_t *src1, const uint32_t *src2, const uint32_t *src2_asymmetric, const uint32_t *_constants); +extern void PQCLEAN_FIRESABER_AARCH64_asm_point_mul_extended(uint32_t *des, const uint32_t *src1, const uint32_t *src2_extended, const uint32_t *_constants); + +#define NTT(in) { \ + PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ + PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ + } + +#define NTT_heavy(in_asymmetric, in) { \ + NTT(in); \ + PQCLEAN_FIRESABER_AARCH64_asm_point_mul_extended(in_asymmetric, in, pre_asymmetric_table_Q1_extended, constants); \ + } + +#define iNTT(in) { \ + PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ + PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants, inv_twist_table_all_Q1_extended); \ + } + +static const uint32_t constants[16] = { + Q1, Q1prime2 +}; + +static const uint32_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 3)) << 1] = { + 0, 0, -119635792, -1424544, 1027317558, 12232619, -496739340, -5914844, -253524894, -3018807, 9103545, 108399, 42771771, 509298, 283911363, 3380629, 0, 0, -66089826, -786954, -259955382, -3095377, -643539471, -7662843, -332278086, -3956548, 703146656, 8372606, -881793531, -10499815, 304160806, 3621746, 0, 0, 34506365, 410879, 663387313, 7899178, -615166382, -7324995, 242706356, 2889987, -1016509854, -12103928, -410776309, -4891253, -1039822114, -12381515, 0, 0, 770061100, 9169379, 176271869, 2098929, 377015451, 4489251, -777437559, -9257213, 185186875, 2205083, -476967921, -5679419, 111859832, 1331953, 0, 0, 267484771, 3185032, -241571930, -2876479, -116066229, -1382040, 605105697, 7205199, 246868243, 2939544, -801225576, -9540465, -29401110, -350089, 0, 0, 461101573, 5490493, -659878385, -7857396, -813049292, -9681254, -610503208, -7269469, 754028719, 8978476, -513464823, -6114000, 974898460, 11608447, 0, 0, -65601052, -781134, 122588677, 1459705, 406381289, 4838920, -584016855, -6954087, 1066347183, 12697358, -347834458, -4141783, -592155281, -7050994, 0, 0, 242486240, 2887366, 1001287142, 11922666, 375772353, 4474449, 752256115, 8957369, 322396534, 3838885, 525597088, 6258463, -971930207, -11573103, 0, 0, -983711428, -11713386, 6721989, 80041, -138847220, -1653301, 687033653, 8180743, -438460075, -5220893, 714691721, 8510077, -689918177, -8215090, 0, 0 +}; + +static const uint32_t pre_asymmetric_table_Q1_extended[NTT_N << 3] = { + -332278086, -3956548, -332278086, -3956548, -332278086, -3956548, -332278086, 
-3956548, 332278086, 3956548, 332278086, 3956548, 332278086, 3956548, 332278086, 3956548, 703146656, 8372606, 703146656, 8372606, 703146656, 8372606, 703146656, 8372606, -703146656, -8372606, -703146656, -8372606, -703146656, -8372606, -703146656, -8372606, -881793531, -10499815, -881793531, -10499815, -881793531, -10499815, -881793531, -10499815, 881793531, 10499815, 881793531, 10499815, 881793531, 10499815, 881793531, 10499815, 304160806, 3621746, 304160806, 3621746, 304160806, 3621746, 304160806, 3621746, -304160806, -3621746, -304160806, -3621746, -304160806, -3621746, -304160806, -3621746, 242706356, 2889987, 242706356, 2889987, 242706356, 2889987, 242706356, 2889987, -242706356, -2889987, -242706356, -2889987, -242706356, -2889987, -242706356, -2889987, -1016509854, -12103928, -1016509854, -12103928, -1016509854, -12103928, -1016509854, -12103928, 1016509854, 12103928, 1016509854, 12103928, 1016509854, 12103928, 1016509854, 12103928, -410776309, -4891253, -410776309, -4891253, -410776309, -4891253, -410776309, -4891253, 410776309, 4891253, 410776309, 4891253, 410776309, 4891253, 410776309, 4891253, -1039822114, -12381515, -1039822114, -12381515, -1039822114, -12381515, -1039822114, -12381515, 1039822114, 12381515, 1039822114, 12381515, 1039822114, 12381515, 1039822114, 12381515, -777437559, -9257213, -777437559, -9257213, -777437559, -9257213, -777437559, -9257213, 777437559, 9257213, 777437559, 9257213, 777437559, 9257213, 777437559, 9257213, 185186875, 2205083, 185186875, 2205083, 185186875, 2205083, 185186875, 2205083, -185186875, -2205083, -185186875, -2205083, -185186875, -2205083, -185186875, -2205083, -476967921, -5679419, -476967921, -5679419, -476967921, -5679419, -476967921, -5679419, 476967921, 5679419, 476967921, 5679419, 476967921, 5679419, 476967921, 5679419, 111859832, 1331953, 111859832, 1331953, 111859832, 1331953, 111859832, 1331953, -111859832, -1331953, -111859832, -1331953, -111859832, -1331953, -111859832, -1331953, 605105697, 7205199, 605105697, 7205199, 605105697, 7205199, 605105697, 7205199, -605105697, -7205199, -605105697, -7205199, -605105697, -7205199, -605105697, -7205199, 246868243, 2939544, 246868243, 2939544, 246868243, 2939544, 246868243, 2939544, -246868243, -2939544, -246868243, -2939544, -246868243, -2939544, -246868243, -2939544, -801225576, -9540465, -801225576, -9540465, -801225576, -9540465, -801225576, -9540465, 801225576, 9540465, 801225576, 9540465, 801225576, 9540465, 801225576, 9540465, -29401110, -350089, -29401110, -350089, -29401110, -350089, -29401110, -350089, 29401110, 350089, 29401110, 350089, 29401110, 350089, 29401110, 350089, -610503208, -7269469, -610503208, -7269469, -610503208, -7269469, -610503208, -7269469, 610503208, 7269469, 610503208, 7269469, 610503208, 7269469, 610503208, 7269469, 754028719, 8978476, 754028719, 8978476, 754028719, 8978476, 754028719, 8978476, -754028719, -8978476, -754028719, -8978476, -754028719, -8978476, -754028719, -8978476, -513464823, -6114000, -513464823, -6114000, -513464823, -6114000, -513464823, -6114000, 513464823, 6114000, 513464823, 6114000, 513464823, 6114000, 513464823, 6114000, 974898460, 11608447, 974898460, 11608447, 974898460, 11608447, 974898460, 11608447, -974898460, -11608447, -974898460, -11608447, -974898460, -11608447, -974898460, -11608447, -584016855, -6954087, -584016855, -6954087, -584016855, -6954087, -584016855, -6954087, 584016855, 6954087, 584016855, 6954087, 584016855, 6954087, 584016855, 6954087, 1066347183, 12697358, 1066347183, 12697358, 1066347183, 12697358, 
1066347183, 12697358, -1066347183, -12697358, -1066347183, -12697358, -1066347183, -12697358, -1066347183, -12697358, -347834458, -4141783, -347834458, -4141783, -347834458, -4141783, -347834458, -4141783, 347834458, 4141783, 347834458, 4141783, 347834458, 4141783, 347834458, 4141783, -592155281, -7050994, -592155281, -7050994, -592155281, -7050994, -592155281, -7050994, 592155281, 7050994, 592155281, 7050994, 592155281, 7050994, 592155281, 7050994, 752256115, 8957369, 752256115, 8957369, 752256115, 8957369, 752256115, 8957369, -752256115, -8957369, -752256115, -8957369, -752256115, -8957369, -752256115, -8957369, 322396534, 3838885, 322396534, 3838885, 322396534, 3838885, 322396534, 3838885, -322396534, -3838885, -322396534, -3838885, -322396534, -3838885, -322396534, -3838885, 525597088, 6258463, 525597088, 6258463, 525597088, 6258463, 525597088, 6258463, -525597088, -6258463, -525597088, -6258463, -525597088, -6258463, -525597088, -6258463, -971930207, -11573103, -971930207, -11573103, -971930207, -11573103, -971930207, -11573103, 971930207, 11573103, 971930207, 11573103, 971930207, 11573103, 971930207, 11573103, 687033653, 8180743, 687033653, 8180743, 687033653, 8180743, 687033653, 8180743, -687033653, -8180743, -687033653, -8180743, -687033653, -8180743, -687033653, -8180743, -438460075, -5220893, -438460075, -5220893, -438460075, -5220893, -438460075, -5220893, 438460075, 5220893, 438460075, 5220893, 438460075, 5220893, 438460075, 5220893, 714691721, 8510077, 714691721, 8510077, 714691721, 8510077, 714691721, 8510077, -714691721, -8510077, -714691721, -8510077, -714691721, -8510077, -714691721, -8510077, -689918177, -8215090, -689918177, -8215090, -689918177, -8215090, -689918177, -8215090, 689918177, 8215090, 689918177, 8215090, 689918177, 8215090, 689918177, 8215090 + }; + +static const uint32_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 3)) << 1] = { + 0, 0, 84, 1, 84, 1, 119635792, 1424544, 84, 1, 496739340, 5914844, 119635792, 1424544, -1027317558, -12232619, 0, 0, 84, 1, 84, 1, 119635792, 1424544, 84, 1, 496739340, 5914844, 119635792, 1424544, -1027317558, -12232619, 0, 0, -283911363, -3380629, 983711428, 11713386, -242486240, -2887366, 138847220, 1653301, -375772353, -4474449, -6721989, -80041, -1001287142, -11922666, 0, 0, 496739340, 5914844, -283911363, -3380629, -42771771, -509298, 983711428, 11713386, 65601052, 781134, -242486240, -2887366, -461101573, -5490493, 0, 0, -9103545, -108399, -267484771, -3185032, -770061100, -9169379, 116066229, 1382040, -377015451, -4489251, 241571930, 2876479, -176271869, -2098929, 0, 0, 119635792, 1424544, 496739340, 5914844, -1027317558, -12232619, -283911363, -3380629, -9103545, -108399, -42771771, -509298, 253524894, 3018807, 0, 0, -42771771, -509298, 65601052, 781134, -461101573, -5490493, -406381289, -4838920, 813049292, 9681254, -122588677, -1459705, 659878385, 7857396, 0, 0, -1027317558, -12232619, -9103545, -108399, 253524894, 3018807, -267484771, -3185032, -34506365, -410879, -770061100, -9169379, 66089826, 786954, 0, 0, 253524894, 3018807, -34506365, -410879, 66089826, 786954, 615166382, 7324995, 643539471, 7662843, -663387313, -7899178, 259955382, 3095377, 0, 0 +}; + +static const uint32_t inv_twist_table_all_Q1_extended[ARRAY_N << 1] = { + -806526676, -9603587, -806526676, -9603587, -806526676, -9603587, -806526676, -9603587, 48233192, 574329, 48233192, 574329, 48233192, 574329, 48233192, 574329, -781310380, -9303328, -781310380, -9303328, -781310380, -9303328, -781310380, -9303328, -672564090, -8008449, 
-672564090, -8008449, -672564090, -8008449, -672564090, -8008449, 246168339, 2931210, 246168339, 2931210, 246168339, 2931210, 246168339, 2931210, -1029960130, -12264085, -1029960130, -12264085, -1029960130, -12264085, -1029960130, -12264085, -740184653, -8813630, -740184653, -8813630, -740184653, -8813630, -740184653, -8813630, 161300767, 1920663, 161300767, 1920663, 161300767, 1920663, 161300767, 1920663, -174979977, -2083546, -174979977, -2083546, -174979977, -2083546, -174979977, -2083546, -95582308, -1138131, -95582308, -1138131, -95582308, -1138131, -95582308, -1138131, -605914106, -7214825, -605914106, -7214825, -605914106, -7214825, -605914106, -7214825, 553452597, 6590148, 553452597, 6590148, 553452597, 6590148, 553452597, 6590148, -224497251, -2673165, -224497251, -2673165, -224497251, -2673165, -224497251, -2673165, 276485019, 3292201, 276485019, 3292201, 276485019, 3292201, 276485019, 3292201, 953978590, 11359347, 953978590, 11359347, 953978590, 11359347, 953978590, 11359347, -411604874, -4901119, -411604874, -4901119, -411604874, -4901119, -411604874, -4901119, 833204424, 9921248, 833204424, 9921248, 833204424, 9921248, 833204424, 9921248, 753488464, 8972043, 753488464, 8972043, 753488464, 8972043, 753488464, 8972043, -38469886, -458074, -38469886, -458074, -38469886, -458074, -38469886, -458074, 852175664, 10147145, 852175664, 10147145, 852175664, 10147145, 852175664, 10147145, -278415257, -3315185, -278415257, -3315185, -278415257, -3315185, -278415257, -3315185, -1014095461, -12075179, -1014095461, -12075179, -1014095461, -12075179, -1014095461, -12075179, 307793104, 3664997, 307793104, 3664997, 307793104, 3664997, 307793104, 3664997, -130967039, -1559469, -130967039, -1559469, -130967039, -1559469, -130967039, -1559469, 478387802, 5696326, 478387802, 5696326, 478387802, 5696326, 478387802, 5696326, 692860396, 8250124, 692860396, 8250124, 692860396, 8250124, 692860396, 8250124, 803792144, 9571026, 803792144, 9571026, 803792144, 9571026, 803792144, 9571026, 352456397, 4196818, 352456397, 4196818, 352456397, 4196818, 352456397, 4196818, 230047357, 2739252, 230047357, 2739252, 230047357, 2739252, 230047357, 2739252, -1026754544, -12225915, -1026754544, -12225915, -1026754544, -12225915, -1026754544, -12225915, 992128925, 11813616, 992128925, 11813616, 992128925, 11813616, 992128925, 11813616, -29941449, -356523, -29941449, -356523, -29941449, -356523, -29941449, -356523, -1068560020, -12723707, -1068560020, -12723707, -1068560020, -12723707, -1068560020, -12723707, -581973493, -6929756, -581973493, -6929756, -581973493, -6929756, -581973493, -6929756, -304246804, -3622770, -304246804, -3622770, -304246804, -3622770, -304246804, -3622770, 542646572, 6461477, 542646572, 6461477, 542646572, 6461477, 542646572, 6461477, -7172803, -85409, -7172803, -85409, -7172803, -85409, -7172803, -85409, -417737898, -4974147, -417737898, -4974147, -417737898, -4974147, -417737898, -4974147, -397539264, -4733635, -397539264, -4733635, -397539264, -4733635, -397539264, -4733635, -711017600, -8466328, -711017600, -8466328, -711017600, -8466328, -711017600, -8466328, 340918639, 4059434, 340918639, 4059434, 340918639, 4059434, 340918639, 4059434, -2971193, -35379, -2971193, -35379, -2971193, -35379, -2971193, -35379, -316030964, -3763088, -316030964, -3763088, -316030964, -3763088, -316030964, -3763088, -980706054, -11677600, -980706054, -11677600, -980706054, -11677600, -980706054, -11677600, -799784280, -9523303, -799784280, -9523303, -799784280, -9523303, -799784280, -9523303, -606599985, 
-7222992, -606599985, -7222992, -606599985, -7222992, -606599985, -7222992, 988795687, 11773926, 988795687, 11773926, 988795687, 11773926, 988795687, 11773926, -318379767, -3791056, -318379767, -3791056, -318379767, -3791056, -318379767, -3791056, 675788404, 8046842, 675788404, 8046842, 675788404, 8046842, 675788404, 8046842, 719075991, 8562282, 719075991, 8562282, 719075991, 8562282, 719075991, 8562282, -410606666, -4889233, -410606666, -4889233, -410606666, -4889233, -410606666, -4889233, -39398809, -469135, -39398809, -469135, -39398809, -469135, -39398809, -469135, -323375678, -3850544, -323375678, -3850544, -323375678, -3850544, -323375678, -3850544, -616711312, -7343391, -616711312, -7343391, -616711312, -7343391, -616711312, -7343391, 197741568, 2354576, 197741568, 2354576, 197741568, 2354576, 197741568, 2354576, 775336082, 9232190, 775336082, 9232190, 775336082, 9232190, 775336082, 9232190, -135399935, -1612253, -135399935, -1612253, -135399935, -1612253, -135399935, -1612253, 865050664, 10300452, 865050664, 10300452, 865050664, 10300452, 865050664, 10300452, -1004611982, -11962256, -1004611982, -11962256, -1004611982, -11962256, -1004611982, -11962256, -621203079, -7396876, -621203079, -7396876, -621203079, -7396876, -621203079, -7396876, 135583351, 1614437, 135583351, 1614437, 135583351, 1614437, 135583351, 1614437, 530210041, 6313391, 530210041, 6313391, 530210041, 6313391, 530210041, 6313391, -695736773, -8284374, -695736773, -8284374, -695736773, -8284374, -695736773, -8284374, 408717831, 4866742, 408717831, 4866742, 408717831, 4866742, 408717831, 4866742 + }; + +#endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/NTT_params.h b/src/kem/saber/pqclean_firesaber_aarch64/NTT_params.h new file mode 100644 index 0000000000..25624db6c9 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/NTT_params.h @@ -0,0 +1,32 @@ +#ifndef NTT_PARAMS_H +#define NTT_PARAMS_H + +#define ARRAY_N 256 + +#define NTT_N 64 +#define LOGNTT_N 6 + +// Q1 +#define Q1 25570817 +// omegaQ1 = 3^( (Q1 - 1) / (NTT_N << 1) ) mod Q1 +#define omegaQ1 21614269 +// invomegaQ1 = omegaQ1^{-1} mod Q1 +#define invomegaQ1 8215090 +// R = 2^32 below +// RmodQ1 = 2^32 mod^{+-} Q1 +#define RmodQ1 (-929960) +// Q1prime = Q1^{-1} mod^{+-} 2^32 +#define Q1prime (-155332095) +// invNQ1 = NTT_N^{-1} mod Q1 +#define invNQ1 25171273 +// R2modQ1 = 2^32 mod^{+-} Q1 +#define R2modQ1 (-929960) +// Q1prime2 = -Q1^{-1} mod^{+-} 2^32 +#define Q1prime2 155332095 + +#endif + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/SABER_indcpa.c b/src/kem/saber/pqclean_firesaber_aarch64/SABER_indcpa.c new file mode 100644 index 0000000000..aadf31e42e --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/SABER_indcpa.c @@ -0,0 +1,196 @@ +#include "NTT.h" +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "cbd.h" +#include "fips202.h" +#include "fips202x2.h" +#include "pack_unpack.h" +#include "randombytes.h" +#include <stdint.h> +#include <string.h> + +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) + +extern void PQCLEAN_FIRESABER_AARCH64_asm_round(uint16_t des[SABER_N], uint32_t src[SABER_N]); +extern void PQCLEAN_FIRESABER_AARCH64_asm_enc_add_msg(uint16_t cipher[SABER_N], uint32_t src[SABER_N], uint16_t msg[SABER_N], int const_h1); +extern void PQCLEAN_FIRESABER_AARCH64_asm_dec_get_msg(uint16_t msg[SABER_N], uint32_t src[SABER_N], uint16_t cipher[SABER_N], int const_h2); + +// IND-CPA key generation: sk packs the secret vector s; pk packs b = round(A^T s) together with seed_A. +void indcpa_kem_keypair(uint8_t
pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + + uint32_t A_NTT[SABER_L][SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N] = {0}; + + uint8_t seed_A[SABER_SEEDBYTES]; + uint8_t seed_s[SABER_NOISE_SEEDBYTES]; + + uint8_t shake_A_buf[SABER_L * SABER_L * SABER_POLYBYTES]; + uint8_t shake_s_buf[SABER_L * SABER_POLYCOINBYTES]; + + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // hash the seed so the public seed_A never exposes raw system RNG output + randombytes(seed_s, SABER_NOISE_SEEDBYTES); + + shake128(shake_A_buf, sizeof(shake_A_buf), seed_A, SABER_SEEDBYTES); + shake128(shake_s_buf, sizeof(shake_s_buf), seed_s, SABER_NOISE_SEEDBYTES); + + // unpack A with swapped indices: key generation multiplies by A^T + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + PQCLEAN_FIRESABER_AARCH64_asm_13_to_32(&(A_NTT[j][i][0]), shake_A_buf + (i * SABER_L + j) * SABER_POLYBYTES); + } + } + + // sample s from the centered binomial distribution and widen to 32-bit lanes + for (int i = 0; i < SABER_L; i++) { + cbd(s[i], shake_s_buf + i * SABER_POLYCOINBYTES); + PQCLEAN_FIRESABER_AARCH64_asm_16_to_32(&(s_NTT[i][0]), &(s[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + NTT(&(A_NTT[i][j][0])); + } + } + + // pointwise multiply in the NTT domain: A_NTT[i][0] accumulates the inner product of row i with s + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul(&(A_NTT[i][0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + } + + for (int i = 0; i < SABER_L; i++) { + iNTT(&(A_NTT[i][0][0])); + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AARCH64_asm_round(b[i], A_NTT[i][0]); + } + + POLVECq2BS(sk, s); + POLVECp2BS(pk, b); + memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); +} + +void indcpa_kem_enc(const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + + uint32_t A_NTT[SABER_L][SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint32_t b_NTT[SABER_L][SABER_N]; + + uint16_t sp[SABER_L][SABER_N]; + uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t vp[SABER_N] = {0}; + uint16_t mp[SABER_N]; + uint16_t b[SABER_L][SABER_N]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + + uint8_t shake_A_buf[SABER_L * SABER_L * SABER_POLYBYTES]; + uint8_t shake_s_buf[SABER_L * SABER_POLYCOINBYTES]; + + shake128(shake_A_buf, sizeof(shake_A_buf), seed_A, SABER_SEEDBYTES); + shake128(shake_s_buf, sizeof(shake_s_buf), seed_sp, SABER_NOISE_SEEDBYTES); + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + PQCLEAN_FIRESABER_AARCH64_asm_13_to_32(&(A_NTT[i][j][0]), shake_A_buf + (i * SABER_L + j) * SABER_POLYBYTES); + } + } + + for (int i = 0; i < SABER_L; i++) { + cbd(sp[i], shake_s_buf + i * SABER_POLYCOINBYTES); + PQCLEAN_FIRESABER_AARCH64_asm_16_to_32(&(s_NTT[i][0]), &(sp[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + NTT(&(A_NTT[i][j][0])); + } + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul(&(A_NTT[i][0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + } + + for (int i = 0; i < SABER_L; i++) { + iNTT(&(A_NTT[i][0][0])); + } + + for (int i = 0; i < SABER_L; i++) {
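+        // asm_round computes bp[i] = ((A s' mod q) >> (SABER_EQ - SABER_EP)) with srshr; +        // srshr's built-in rounding constant 2^(EQ-EP-1) equals h1, so the "+ h1" +        // addition from the Saber specification is applied implicitly.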
PQCLEAN_FIRESABER_AARCH64_asm_round(bp[i], A_NTT[i][0]); + } + + + BS2POLVECp(pk, b); + BS2POLmsg(m, mp); + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AARCH64_asm_16_to_32(&(b_NTT[i][0]), &(b[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT(&(b_NTT[i][0])); + } + + PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul(&(b_NTT[0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + + iNTT(&(b_NTT[0][0])); + + // c_m = ((v' + h1 - 2^(SABER_EP-1) m) mod p) >> (SABER_EP - SABER_ET) + PQCLEAN_FIRESABER_AARCH64_asm_enc_add_msg(vp, b_NTT[0], mp, h1); + + POLVECp2BS(ciphertext, bp); + POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); + + +} + +void indcpa_kem_dec(const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC], uint8_t m[SABER_KEYBYTES]) { + + uint32_t b_NTT[SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint16_t v[SABER_N] = {0}; + uint16_t cm[SABER_N]; + + BS2POLT(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, cm); + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AARCH64_asm_13_to_32(&(s_NTT[i][0]), sk + i * SABER_POLYBYTES); + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AARCH64_asm_10_to_32(&(b_NTT[i][0]), ciphertext + i * (SABER_EP * SABER_N / 8)); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT(&(b_NTT[i][0])); + } + + PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul(&(b_NTT[0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + + iNTT(&(b_NTT[0][0])); + + PQCLEAN_FIRESABER_AARCH64_asm_dec_get_msg(v, b_NTT[0], cm, h2); + + POLmsg2BS(m, v); +} + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/SABER_indcpa.h b/src/kem/saber/pqclean_firesaber_aarch64/SABER_indcpa.h new file mode 100644 index 0000000000..0b74c2fca0 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/SABER_indcpa.h @@ -0,0 +1,14 @@ +#ifndef INDCPA_H +#define INDCPA_H + +#include "SABER_params.h" +#include <stdint.h> + +#define indcpa_kem_keypair SABER_NAMESPACE(indcpa_kem_keypair) +void indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); +#define indcpa_kem_enc SABER_NAMESPACE(indcpa_kem_enc) +void indcpa_kem_enc(const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t ciphertext[SABER_BYTES_CCA_DEC]); +#define indcpa_kem_dec SABER_NAMESPACE(indcpa_kem_dec) +void indcpa_kem_dec(const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC], uint8_t m[SABER_KEYBYTES]); + +#endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/SABER_params.h b/src/kem/saber/pqclean_firesaber_aarch64/SABER_params.h new file mode 100644 index 0000000000..0d9111c4cd --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/SABER_params.h @@ -0,0 +1,48 @@ +#ifndef PARAMS_H +#define PARAMS_H +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso.
+=============================================================================*/ + +#define SABER_NAMESPACE(s) PQCLEAN_FIRESABER_AARCH64_##s +#define SABER_L 4 + +/* Don't change anything below this line */ +#define SABER_MU 6 +#define SABER_ET 6 + +#define SABER_EQ 13 +#define SABER_EP 10 +#define SABER_N 256 + +#define SABER_Q 8192 //2^13 +#define SABER_P 1024 + +#define SABER_SEEDBYTES 32 +#define SABER_NOISE_SEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 + +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) + +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) + +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) + +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) + +#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) +#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) + +#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) + +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) + +#endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/__asm_NTT.S b/src/kem/saber/pqclean_firesaber_aarch64/__asm_NTT.S new file mode 100644 index 0000000000..d39cfa15ec --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/__asm_NTT.S @@ -0,0 +1,309 @@ + +#include "macros.inc" + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_top +.global _PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_top +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_top, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_top: +_PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_top: + + push_all + Q .req w20 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + src4 .req x4 + src5 .req x5 + src6 .req x6 + src7 .req x7 + src8 .req x8 + src9 .req x9 + src10 .req x10 + src11 .req x11 + src12 .req x12 + src13 .req x13 + src14 .req x14 + src15 .req x15 + table .req x28 + counter .req x19 + + ldr Q, [x2] + + mov table, x1 + + add src1, src0, #64 + add src2, src0, #128 + + add src3, src0, #192 + add src4, src0, #256 + + add src5, src0, #320 + add src6, src0, #384 + + add src7, src0, #448 + add src8, src0, #512 + + add src9, src0, #576 + add src10, src0, #640 + + add src11, src0, #704 + add src12, src0, #768 + + add src13, src0, #832 + add src14, src0, #896 + + add src15, src0, #960 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 + + mov v20.S[0], Q + + ld1 { v0.4S}, [ src0] + ld1 { v2.4S}, [ src2] + ld1 { v4.4S}, [ src4] + ld1 { v6.4S}, [ src6] + ld1 { v8.4S}, [ src8] + ld1 {v10.4S}, [src10] + ld1 {v12.4S}, [src12] + ld1 {v14.4S}, [src14] + + ld1 { v1.4S}, [ src1] + ld1 { v3.4S}, [ src3] + ld1 { v5.4S}, [ src5] + ld1 { v7.4S}, [ src7] + ld1 { v9.4S}, [ src9] + ld1 {v11.4S}, [src11] + ld1 {v13.4S}, [src13] + ld1 {v15.4S}, [src15] + + qq_butterfly_top v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, 
v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_bot v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + + mov counter, #3 + _ntt_top_loop: + + st1 { v0.4S}, [ src0], #16 + ld1 { v0.4S}, [ src0] + st1 { v2.4S}, [ src2], #16 + ld1 { v2.4S}, [ src2] + st1 { v4.4S}, [ src4], #16 + ld1 { v4.4S}, [ src4] + st1 { v6.4S}, [ src6], #16 + ld1 { v6.4S}, [ src6] + st1 { v8.4S}, [ src8], #16 + ld1 { v8.4S}, [ src8] + st1 {v10.4S}, [src10], #16 + ld1 {v10.4S}, [src10] + st1 {v12.4S}, [src12], #16 + ld1 {v12.4S}, [src12] + st1 {v14.4S}, [src14], #16 + ld1 {v14.4S}, [src14] + + st1 { v1.4S}, [ src1], #16 + ld1 { v1.4S}, [ src1] + st1 { v3.4S}, [ src3], #16 + ld1 { v3.4S}, [ src3] + st1 { v5.4S}, [ src5], #16 + ld1 { v5.4S}, [ src5] + st1 { v7.4S}, [ src7], #16 + ld1 { v7.4S}, [ src7] + st1 { v9.4S}, [ src9], #16 + ld1 { v9.4S}, [ src9] + st1 {v11.4S}, [src11], #16 + ld1 {v11.4S}, [src11] + st1 {v13.4S}, [src13], #16 + ld1 {v13.4S}, [src13] + st1 {v15.4S}, [src15], #16 + ld1 {v15.4S}, [src15] + + qq_butterfly_top v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_bot v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + + sub counter, counter, #1 + cbnz counter, _ntt_top_loop + + st1 { v0.4S}, [ src0], #16 + st1 { v2.4S}, [ src2], #16 + st1 { v4.4S}, [ src4], #16 + st1 { v6.4S}, [ src6], #16 + st1 { v8.4S}, [ src8], #16 + st1 {v10.4S}, [src10], #16 + st1 {v12.4S}, [src12], #16 + st1 {v14.4S}, [src14], #16 + + st1 { v1.4S}, [ src1], #16 + st1 { v3.4S}, [ src3], #16 + st1 { v5.4S}, [ src5], #16 + st1 { v7.4S}, [ src7], #16 + st1 { v9.4S}, [ src9], #16 + st1 {v11.4S}, [src11], #16 + 
st1 {v13.4S}, [src13], #16 + st1 {v15.4S}, [src15], #16 + + .unreq Q + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq src4 + .unreq src5 + .unreq src6 + .unreq src7 + .unreq src8 + .unreq src9 + .unreq src10 + .unreq src11 + .unreq src12 + .unreq src13 + .unreq src14 + .unreq src15 + .unreq table + .unreq counter + pop_all + + br lr + + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_bot +.global _PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_bot +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_bot, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_bot: +_PQCLEAN_FIRESABER_AARCH64_asm_ntt_SIMD_bot: + + push_all + Q .req w20 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + table0 .req x27 + table1 .req x28 + counter .req x19 + + ldr Q, [x2] + + add table0, x1, #64 + add table1, x1, #320 + + add src1, src0, #0 + add src2, src0, #512 + add src3, src0, #512 + + ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 + ld1 { v8.4S, v9.4S, v10.4S, v11.4S}, [src2], #64 + ld1 { v4.4S, v5.4S, v6.4S, v7.4S}, [src0], #64 + ld1 {v12.4S, v13.4S, v14.4S, v15.4S}, [src2], #64 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table0], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + + mov v20.S[0], Q + + qq_butterfly_top v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3 + qq_butterfly_mixed v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v20, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3 + qq_butterfly_mixed v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + mov counter, #3 + _ntt_bot_loop: + + st1 { v0.4S}, [src1], #16 + ld1 { v0.4S}, [src0], #16 + st1 { v1.4S}, [src1], #16 + ld1 { v1.4S}, [src0], #16 + st1 { v2.4S}, [src1], #16 + ld1 { v2.4S}, [src0], #16 + st1 { v3.4S}, [src1], #16 + ld1 { v3.4S}, [src0], #16 + st1 { v4.4S}, [src1], #16 + ld1 { v4.4S}, [src0], #16 + st1 { v5.4S}, [src1], #16 + ld1 { v5.4S}, [src0], #16 + st1 { v6.4S}, [src1], #16 + ld1 { v6.4S}, [src0], #16 + st1 { v7.4S}, [src1], #16 + ld1 { v7.4S}, [src0], #16 + st1 { v8.4S}, [src3], #16 + ld1 { v8.4S}, [src2], #16 + st1 { v9.4S}, [src3], #16 + ld1 { v9.4S}, [src2], #16 + st1 {v10.4S}, [src3], #16 + ld1 {v10.4S}, [src2], #16 + st1 {v11.4S}, [src3], #16 + ld1 {v11.4S}, [src2], #16 + st1 {v12.4S}, [src3], #16 + ld1 {v12.4S}, [src2], #16 + st1 {v13.4S}, [src3], #16 + ld1 {v13.4S}, [src2], #16 + st1 {v14.4S}, [src3], #16 + ld1 {v14.4S}, [src2], #16 + st1 {v15.4S}, [src3], #16 + ld1 {v15.4S}, [src2], #16 + + 
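+    // each iteration consumes a fresh block of twiddle factors from the two +    // tables; lane 0 of v20 is then overwritten with the modulus Q used by +    // the butterfly reductions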
ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table0], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + + mov v20.S[0], Q + + qq_butterfly_top v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3 + qq_butterfly_mixed v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v20, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3 + qq_butterfly_mixed v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + sub counter, counter, #1 + cbnz counter, _ntt_bot_loop + + st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src1], #64 + st1 { v8.4S, v9.4S, v10.4S, v11.4S}, [src3], #64 + st1 { v4.4S, v5.4S, v6.4S, v7.4S}, [src1], #64 + st1 {v12.4S, v13.4S, v14.4S, v15.4S}, [src3], #64 + + .unreq Q + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq table0 + .unreq table1 + pop_all + + br lr + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/__asm_iNTT.S b/src/kem/saber/pqclean_firesaber_aarch64/__asm_iNTT.S new file mode 100644 index 0000000000..0b38928e9d --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/__asm_iNTT.S @@ -0,0 +1,472 @@ + +#include "macros.inc" + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_top +.global _PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_top +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_top, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_top: +_PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_top: + + push_all + Q .req w20 + src0 .req x0 + des0 .req x1 + src1 .req x2 + des1 .req x3 + table .req x28 + counter .req x19 + + ldr Q, [x2] + + mov table, x1 + + add des0, src0, #0 + add src1, src0, #512 + add des1, src0, #512 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 + + mov v20.S[0], Q + + ld1 { v0.4S}, [src0], #16 + ld1 { v1.4S}, [src0], #16 + ld1 { v2.4S}, [src0], #16 + ld1 { v3.4S}, [src0], #16 + ld1 { v4.4S}, [src0], #16 + ld1 { v5.4S}, [src0], #16 + ld1 { v6.4S}, [src0], #16 + ld1 { v7.4S}, [src0], #16 + + ld1 { v8.4S}, [src1], #16 + ld1 { v9.4S}, [src1], #16 + ld1 {v10.4S}, [src1], #16 + ld1 {v11.4S}, [src1], #16 + ld1 {v12.4S}, [src1], #16 + ld1 {v13.4S}, [src1], #16 + ld1 {v14.4S}, [src1], #16 + ld1 {v15.4S}, [src1], #16 + + qq_add_sub v16, v17, v18, v19, v1, v3, v5, v7, v0, v2, v4, v6, v1, v3, v5, v7 + qq_add_sub v28, v29, v30, v31, v9, v11, v13, v15, v8, v10, v12, v14, v9, v11, v13, v15 + + qq_add_sub v0, v4, v8, v12, v2, v6, v10, v14, v16, v18, v28, v30, v17, v19, v29, v31 + + dq_butterfly_top v1, v5, v3, v7, v16, v17, v20, 
v21, 2, 3, v21, 2, 3 + + dq_butterfly_mixed v1, v5, v3, v7, v16, v17, v9, v13, v11, v15, v18, v19, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + dq_butterfly_mixed v9, v13, v11, v15, v18, v19, v0, v1, v4, v5, v28, v29, v20, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v0, v1, v4, v5, v28, v29, v2, v3, v6, v7, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v30, v31, v8, v9, v12, v13, v16, v17, v20, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v8, v9, v12, v13, v16, v17, v10, v11, v14, v15, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_bot v10, v11, v14, v15, v18, v19, v20, v23, 0, 1, v23, 2, 3 + + mov counter, #3 + _intt_top_loop: + + st1 { v0.4S}, [des0], #16 + ld1 { v0.4S}, [src0], #16 + st1 { v1.4S}, [des0], #16 + ld1 { v1.4S}, [src0], #16 + st1 { v2.4S}, [des0], #16 + ld1 { v2.4S}, [src0], #16 + st1 { v3.4S}, [des0], #16 + ld1 { v3.4S}, [src0], #16 + st1 { v4.4S}, [des0], #16 + ld1 { v4.4S}, [src0], #16 + st1 { v5.4S}, [des0], #16 + ld1 { v5.4S}, [src0], #16 + st1 { v6.4S}, [des0], #16 + ld1 { v6.4S}, [src0], #16 + st1 { v7.4S}, [des0], #16 + ld1 { v7.4S}, [src0], #16 + + st1 { v8.4S}, [des1], #16 + ld1 { v8.4S}, [src1], #16 + st1 { v9.4S}, [des1], #16 + ld1 { v9.4S}, [src1], #16 + st1 {v10.4S}, [des1], #16 + ld1 {v10.4S}, [src1], #16 + st1 {v11.4S}, [des1], #16 + ld1 {v11.4S}, [src1], #16 + st1 {v12.4S}, [des1], #16 + ld1 {v12.4S}, [src1], #16 + st1 {v13.4S}, [des1], #16 + ld1 {v13.4S}, [src1], #16 + st1 {v14.4S}, [des1], #16 + ld1 {v14.4S}, [src1], #16 + st1 {v15.4S}, [des1], #16 + ld1 {v15.4S}, [src1], #16 + + qq_add_sub v16, v17, v18, v19, v1, v3, v5, v7, v0, v2, v4, v6, v1, v3, v5, v7 + qq_add_sub v28, v29, v30, v31, v9, v11, v13, v15, v8, v10, v12, v14, v9, v11, v13, v15 + + qq_add_sub v0, v4, v8, v12, v2, v6, v10, v14, v16, v18, v28, v30, v17, v19, v29, v31 + + dq_butterfly_top v1, v5, v3, v7, v16, v17, v20, v21, 2, 3, v21, 2, 3 + + dq_butterfly_mixed v1, v5, v3, v7, v16, v17, v9, v13, v11, v15, v18, v19, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + dq_butterfly_mixed v9, v13, v11, v15, v18, v19, v0, v1, v4, v5, v28, v29, v20, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v0, v1, v4, v5, v28, v29, v2, v3, v6, v7, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v30, v31, v8, v9, v12, v13, v16, v17, v20, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v8, v9, v12, v13, v16, v17, v10, v11, v14, v15, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_bot v10, v11, v14, v15, v18, v19, v20, v23, 0, 1, v23, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + + st1 { v0.4S}, [des0], #16 + st1 { v1.4S}, [des0], #16 + st1 { v2.4S}, [des0], #16 + st1 { v3.4S}, [des0], #16 + st1 { v4.4S}, [des0], #16 + st1 { v5.4S}, [des0], #16 + st1 { v6.4S}, [des0], #16 + st1 { v7.4S}, [des0], #16 + + st1 { v8.4S}, [des1], #16 + st1 { v9.4S}, [des1], #16 + st1 {v10.4S}, [des1], #16 + st1 {v11.4S}, [des1], #16 + st1 {v12.4S}, [des1], #16 + st1 {v13.4S}, [des1], #16 + st1 {v14.4S}, [des1], #16 + st1 {v15.4S}, [des1], #16 + + .unreq Q + .unreq src0 + .unreq des0 + .unreq src1 + .unreq des1 + .unreq table + .unreq counter + pop_all + + br lr + + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_bot +.global _PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_bot +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_bot, %function +#endif 
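+// intt_SIMD_bot: last inverse-NTT butterfly layers, fused with the final twisting +// (pointwise multiplication by the precomputed inv_twist tables via the +// sqrdmulh/mul/mls high-half/low-half modular multiplication) and a conditional +// +/- Q correction that keeps every coefficient centered in (-Q/2, Q/2].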
+PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_bot: +_PQCLEAN_FIRESABER_AARCH64_asm_intt_SIMD_bot: + + push_all + Q .req w20 + Qhalf .req w21 + nQhalf .req w22 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + src4 .req x4 + src5 .req x5 + src6 .req x6 + src7 .req x7 + table .req x28 + twistT0 .req x8 + twistT1 .req x9 + twistT2 .req x10 + twistT3 .req x11 + twistT4 .req x12 + twistT5 .req x13 + twistT6 .req x14 + twistT7 .req x15 + counter .req x19 + + add twistT0, x3, #256*0 + add twistT1, x3, #256*1 + add twistT2, x3, #256*2 + add twistT3, x3, #256*3 + add twistT4, x3, #256*4 + add twistT5, x3, #256*5 + add twistT6, x3, #256*6 + add twistT7, x3, #256*7 + + ldr Q, [x2] + lsr Qhalf, Q, #1 + neg nQhalf, Qhalf + + add table, x1, #64 + + add src1, src0, #128 + add src2, src0, #256 + add src3, src0, #384 + add src4, src0, #512 + add src5, src0, #640 + add src6, src0, #768 + add src7, src0, #896 + + ld1 { v0.4S}, [ src0] + ld1 { v1.4S}, [ src1] + ld1 { v2.4S}, [ src2] + ld1 { v3.4S}, [ src3] + ld1 { v4.4S}, [ src4] + ld1 { v5.4S}, [ src5] + ld1 { v6.4S}, [ src6] + ld1 { v7.4S}, [ src7] + + ld1 {v20.4S}, [table], #16 + ld1 {v21.4S}, [table], #16 + ld1 {v22.4S}, [table], #16 + ld1 {v23.4S}, [table], #16 + + dup v24.4S, Q + dup v25.4S, Qhalf + dup v26.4S, nQhalf + + dq_butterfly_top v4, v6, v5, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v4, v6, v5, v7, v18, v19, v0, v2, v1, v3, v16, v17, v24, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v0, v2, v1, v3, v16, v17, v4, v5, v6, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v4, v5, v6, v7, v18, v19, v0, v1, v2, v3, v16, v17, v24, v21, 0, 1, v21, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v0, v1, v2, v3, v16, v17, v2, v3, v6, v7, v18, v19, v24, v21, 0, 1, v21, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v18, v19, v0, v1, v4, v5, v16, v17, v24, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_bot v0, v1, v4, v5, v16, v17, v24, v22, 0, 1, v22, 2, 3 + + ld2 { v8.4S, v9.4S}, [twistT0], #32 + ld2 {v10.4S, v11.4S}, [twistT1], #32 + ld2 {v12.4S, v13.4S}, [twistT2], #32 + ld2 {v14.4S, v15.4S}, [twistT3], #32 + + sqrdmulh v16.4S, v0.4S, v8.4S + sqrdmulh v17.4S, v1.4S, v10.4S + sqrdmulh v18.4S, v2.4S, v12.4S + sqrdmulh v19.4S, v3.4S, v14.4S + + mul v0.4S, v0.4S, v9.4S + mul v1.4S, v1.4S, v11.4S + mul v2.4S, v2.4S, v13.4S + mul v3.4S, v3.4S, v15.4S + + mls v0.4S, v16.4S, v24.4S + ld2 { v8.4S, v9.4S}, [twistT4], #32 + mls v1.4S, v17.4S, v24.4S + ld2 {v10.4S, v11.4S}, [twistT5], #32 + mls v2.4S, v18.4S, v24.4S + ld2 {v12.4S, v13.4S}, [twistT6], #32 + mls v3.4S, v19.4S, v24.4S + ld2 {v14.4S, v15.4S}, [twistT7], #32 + + cmge v18.4S, v26.4S, v0.4S + sqrdmulh v20.4S, v4.4S, v8.4S + cmge v19.4S, v26.4S, v1.4S + sqrdmulh v21.4S, v5.4S, v10.4S + cmgt v16.4S, v0.4S, v25.4S + sqrdmulh v22.4S, v6.4S, v12.4S + cmgt v17.4S, v1.4S, v25.4S + sqrdmulh v23.4S, v7.4S, v14.4S + + sub v16.4S, v16.4S, v18.4S + mul v4.4S, v4.4S, v9.4S + sub v17.4S, v17.4S, v19.4S + mul v5.4S, v5.4S, v11.4S + + mla v0.4S, v16.4S, v24.4S + mul v6.4S, v6.4S, v13.4S + mla v1.4S, v17.4S, v24.4S + mul v7.4S, v7.4S, v15.4S + + cmge v18.4S, v26.4S, v2.4S + mls v4.4S, v20.4S, v24.4S + cmge v19.4S, v26.4S, v3.4S + mls v5.4S, v21.4S, v24.4S + cmgt v16.4S, v2.4S, v25.4S + mls v6.4S, v22.4S, v24.4S + cmgt v17.4S, v3.4S, v25.4S + mls v7.4S, v23.4S, v24.4S + + sub v16.4S, v16.4S, v18.4S + cmge v22.4S, v26.4S, v4.4S + sub v17.4S, v17.4S, v19.4S + cmge v23.4S, v26.4S, v5.4S + + mla v2.4S, v16.4S, v24.4S + 
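+    // (interleaved with the multiplies around it: each cmge/cmgt pair builds +    // all-ones masks for out-of-range lanes, and the following sub/mla adds or +    // subtracts Q exactly once per offending lane)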
cmgt v20.4S, v4.4S, v25.4S + mla v3.4S, v17.4S, v24.4S + cmgt v21.4S, v5.4S, v25.4S + + st1 { v0.4S}, [ src0], #16 + sub v20.4S, v20.4S, v22.4S + st1 { v1.4S}, [ src1], #16 + sub v21.4S, v21.4S, v23.4S + st1 { v2.4S}, [ src2], #16 + mla v4.4S, v20.4S, v24.4S + st1 { v3.4S}, [ src3], #16 + mla v5.4S, v21.4S, v24.4S + + mov counter, #7 + _intt_bot_loop: + + cmge v22.4S, v26.4S, v6.4S + ld1 { v0.4S}, [ src0] + cmge v23.4S, v26.4S, v7.4S + ld1 { v1.4S}, [ src1] + cmgt v20.4S, v6.4S, v25.4S + ld1 { v2.4S}, [ src2] + cmgt v21.4S, v7.4S, v25.4S + ld1 { v3.4S}, [ src3] + + sub v20.4S, v20.4S, v22.4S + sub v21.4S, v21.4S, v23.4S + + mla v6.4S, v20.4S, v24.4S + mla v7.4S, v21.4S, v24.4S + + st1 { v4.4S}, [ src4], #16 + ld1 { v4.4S}, [ src4] + st1 { v5.4S}, [ src5], #16 + ld1 { v5.4S}, [ src5] + st1 { v6.4S}, [ src6], #16 + ld1 { v6.4S}, [ src6] + st1 { v7.4S}, [ src7], #16 + ld1 { v7.4S}, [ src7] + + ld1 {v20.4S}, [table], #16 + ld1 {v21.4S}, [table], #16 + ld1 {v22.4S}, [table], #16 + ld1 {v23.4S}, [table], #16 + + dup v24.4S, Q + dup v25.4S, Qhalf + dup v26.4S, nQhalf + + dq_butterfly_top v4, v6, v5, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v4, v6, v5, v7, v18, v19, v0, v2, v1, v3, v16, v17, v24, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v0, v2, v1, v3, v16, v17, v4, v5, v6, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v4, v5, v6, v7, v18, v19, v0, v1, v2, v3, v16, v17, v24, v21, 0, 1, v21, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v0, v1, v2, v3, v16, v17, v2, v3, v6, v7, v18, v19, v24, v21, 0, 1, v21, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v18, v19, v0, v1, v4, v5, v16, v17, v24, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_bot v0, v1, v4, v5, v16, v17, v24, v22, 0, 1, v22, 2, 3 + + ld2 { v8.4S, v9.4S}, [twistT0], #32 + ld2 {v10.4S, v11.4S}, [twistT1], #32 + ld2 {v12.4S, v13.4S}, [twistT2], #32 + ld2 {v14.4S, v15.4S}, [twistT3], #32 + + sqrdmulh v16.4S, v0.4S, v8.4S + sqrdmulh v17.4S, v1.4S, v10.4S + sqrdmulh v18.4S, v2.4S, v12.4S + sqrdmulh v19.4S, v3.4S, v14.4S + + mul v0.4S, v0.4S, v9.4S + mul v1.4S, v1.4S, v11.4S + mul v2.4S, v2.4S, v13.4S + mul v3.4S, v3.4S, v15.4S + + mls v0.4S, v16.4S, v24.4S + ld2 { v8.4S, v9.4S}, [twistT4], #32 + mls v1.4S, v17.4S, v24.4S + ld2 {v10.4S, v11.4S}, [twistT5], #32 + mls v2.4S, v18.4S, v24.4S + ld2 {v12.4S, v13.4S}, [twistT6], #32 + mls v3.4S, v19.4S, v24.4S + ld2 {v14.4S, v15.4S}, [twistT7], #32 + + cmge v18.4S, v26.4S, v0.4S + sqrdmulh v20.4S, v4.4S, v8.4S + cmge v19.4S, v26.4S, v1.4S + sqrdmulh v21.4S, v5.4S, v10.4S + cmgt v16.4S, v0.4S, v25.4S + sqrdmulh v22.4S, v6.4S, v12.4S + cmgt v17.4S, v1.4S, v25.4S + sqrdmulh v23.4S, v7.4S, v14.4S + + sub v16.4S, v16.4S, v18.4S + mul v4.4S, v4.4S, v9.4S + sub v17.4S, v17.4S, v19.4S + mul v5.4S, v5.4S, v11.4S + + mla v0.4S, v16.4S, v24.4S + mul v6.4S, v6.4S, v13.4S + mla v1.4S, v17.4S, v24.4S + mul v7.4S, v7.4S, v15.4S + + cmge v18.4S, v26.4S, v2.4S + mls v4.4S, v20.4S, v24.4S + cmge v19.4S, v26.4S, v3.4S + mls v5.4S, v21.4S, v24.4S + cmgt v16.4S, v2.4S, v25.4S + mls v6.4S, v22.4S, v24.4S + cmgt v17.4S, v3.4S, v25.4S + mls v7.4S, v23.4S, v24.4S + + sub v16.4S, v16.4S, v18.4S + cmge v22.4S, v26.4S, v4.4S + sub v17.4S, v17.4S, v19.4S + cmge v23.4S, v26.4S, v5.4S + + mla v2.4S, v16.4S, v24.4S + cmgt v20.4S, v4.4S, v25.4S + mla v3.4S, v17.4S, v24.4S + cmgt v21.4S, v5.4S, v25.4S + + st1 { v0.4S}, [ src0], #16 + sub v20.4S, v20.4S, v22.4S + st1 { v1.4S}, [ src1], #16 + sub v21.4S, v21.4S, v23.4S + 
st1 { v2.4S}, [ src2], #16 + mla v4.4S, v20.4S, v24.4S + st1 { v3.4S}, [ src3], #16 + mla v5.4S, v21.4S, v24.4S + + sub counter, counter, #1 + cbnz counter, _intt_bot_loop + + cmge v22.4S, v26.4S, v6.4S + cmge v23.4S, v26.4S, v7.4S + cmgt v20.4S, v6.4S, v25.4S + cmgt v21.4S, v7.4S, v25.4S + + sub v20.4S, v20.4S, v22.4S + sub v21.4S, v21.4S, v23.4S + + mla v6.4S, v20.4S, v24.4S + mla v7.4S, v21.4S, v24.4S + + st1 { v4.4S}, [ src4], #16 + st1 { v5.4S}, [ src5], #16 + st1 { v6.4S}, [ src6], #16 + st1 { v7.4S}, [ src7], #16 + + .unreq Q + .unreq Qhalf + .unreq nQhalf + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq src4 + .unreq src5 + .unreq src6 + .unreq src7 + .unreq table + .unreq twistT0 + .unreq twistT1 + .unreq twistT2 + .unreq twistT3 + .unreq twistT4 + .unreq twistT5 + .unreq twistT6 + .unreq twistT7 + .unreq counter + pop_all + + br lr + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/__asm_mul.S b/src/kem/saber/pqclean_firesaber_aarch64/__asm_mul.S new file mode 100644 index 0000000000..e30a3f66d6 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/__asm_mul.S @@ -0,0 +1,255 @@ + +#include "macros.inc" +#include "SABER_params.h" + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul +.global _PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul: +_PQCLEAN_FIRESABER_AARCH64_asm_asymmetric_mul: + + push_all + + ldr w28, [x3, #0] + ldr w27, [x3, #4] + + dup v28.4S, w28 + dup v29.4S, w27 + + add x11, x0, #0 + + add x4, x0, #1024 + add x5, x1, #1024 + add x6, x2, #1024 + +.if SABER_L > 2 + add x8, x0, #2048 + add x9, x1, #2048 + add x10, x2, #2048 +.endif + +.if SABER_L > 3 + add x12, x0, #3072 + add x13, x1, #3072 + add x14, x2, #3072 +.endif + + mov x16, #16 + _asymmetric_loop: + + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x0], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x1], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [ x2], #64 + + _4x4_asymmetric smull, smull2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 + + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x4], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x5], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [ x6], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 + +.if SABER_L > 2 + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x8], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x9], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [x10], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 +.endif + +.if SABER_L > 3 + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, 
[x12], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [x13], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [x14], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 +.endif + + qq_montgomery v24, v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v0, v1, v2, v3, v29, v28 + + st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x11], #64 + + sub x16, x16, #1 + cbnz x16, _asymmetric_loop + + pop_all + + br lr + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_point_mul_extended +.global _PQCLEAN_FIRESABER_AARCH64_asm_point_mul_extended +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_point_mul_extended, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_point_mul_extended: +_PQCLEAN_FIRESABER_AARCH64_asm_point_mul_extended: + + push_all + + ldr w20, [x3] + + ld1 { v0.4S}, [x1], #16 + ld1 { v1.4S}, [x1], #16 + ld1 { v2.4S}, [x1], #16 + ld1 { v3.4S}, [x1], #16 + + ld2 { v4.4S, v5.4S}, [x2], #32 + ld2 { v6.4S, v7.4S}, [x2], #32 + ld2 { v8.4S, v9.4S}, [x2], #32 + ld2 {v10.4S, v11.4S}, [x2], #32 + + sqrdmulh v12.4S, v0.4S, v4.4S + sqrdmulh v13.4S, v1.4S, v6.4S + sqrdmulh v14.4S, v2.4S, v8.4S + sqrdmulh v15.4S, v3.4S, v10.4S + + mov x16, #7 + _point_mul_loop: + + dup v4.4S, w20 + + mul v0.4S, v0.4S, v5.4S + ld1 {v16.4S}, [x1], #16 + mul v1.4S, v1.4S, v7.4S + ld1 {v17.4S}, [x1], #16 + mul v2.4S, v2.4S, v9.4S + ld1 {v18.4S}, [x1], #16 + mul v3.4S, v3.4S, v11.4S + ld1 {v19.4S}, [x1], #16 + + mls v0.4S, v12.4S, v4.4S + ld2 {v20.4S, v21.4S}, [x2], #32 + mls v1.4S, v13.4S, v4.4S + ld2 {v22.4S, v23.4S}, [x2], #32 + mls v2.4S, v14.4S, v4.4S + ld2 {v24.4S, v25.4S}, [x2], #32 + mls v3.4S, v15.4S, v4.4S + ld2 {v26.4S, v27.4S}, [x2], #32 + + st1 { v0.4S}, [x0], #16 + sqrdmulh v28.4S, v16.4S, v20.4S + st1 { v1.4S}, [x0], #16 + sqrdmulh v29.4S, v17.4S, v22.4S + st1 { v2.4S}, [x0], #16 + sqrdmulh v30.4S, v18.4S, v24.4S + st1 { v3.4S}, [x0], #16 + sqrdmulh v31.4S, v19.4S, v26.4S + + dup v20.4S, w20 + + mul v16.4S, v16.4S, v21.4S + ld1 { v0.4S}, [x1], #16 + mul v17.4S, v17.4S, v23.4S + ld1 { v1.4S}, [x1], #16 + mul v18.4S, v18.4S, v25.4S + ld1 { v2.4S}, [x1], #16 + mul v19.4S, v19.4S, v27.4S + ld1 { v3.4S}, [x1], #16 + + mls v16.4S, v28.4S, v20.4S + ld2 { v4.4S, v5.4S}, [x2], #32 + mls v17.4S, v29.4S, v20.4S + ld2 { v6.4S, v7.4S}, [x2], #32 + mls v18.4S, v30.4S, v20.4S + ld2 { v8.4S, v9.4S}, [x2], #32 + mls v19.4S, v31.4S, v20.4S + ld2 {v10.4S, v11.4S}, [x2], #32 + + st1 {v16.4S}, [x0], #16 + sqrdmulh v12.4S, v0.4S, v4.4S + st1 {v17.4S}, [x0], #16 + sqrdmulh v13.4S, v1.4S, v6.4S + st1 {v18.4S}, [x0], #16 + sqrdmulh v14.4S, v2.4S, v8.4S + st1 {v19.4S}, [x0], #16 + sqrdmulh v15.4S, v3.4S, v10.4S + + sub x16, x16, #1 + cbnz x16, _point_mul_loop + + dup v4.4S, w20 + + mul v0.4S, v0.4S, v5.4S + ld1 {v16.4S}, [x1], #16 + mul v1.4S, v1.4S, v7.4S + ld1 {v17.4S}, [x1], #16 + mul v2.4S, v2.4S, v9.4S + ld1 {v18.4S}, [x1], #16 + mul v3.4S, v3.4S, v11.4S + ld1 {v19.4S}, [x1], #16 + + mls v0.4S, v12.4S, v4.4S + ld2 {v20.4S, v21.4S}, [x2], #32 + mls v1.4S, v13.4S, v4.4S + ld2 {v22.4S, v23.4S}, [x2], #32 + mls v2.4S, v14.4S, v4.4S + ld2 {v24.4S, v25.4S}, [x2], #32 + mls v3.4S, v15.4S, v4.4S + ld2 {v26.4S, v27.4S}, [x2], #32 + + st1 { v0.4S}, [x0], #16 + sqrdmulh v28.4S, v16.4S, v20.4S + st1 { v1.4S}, [x0], #16 + sqrdmulh 
v29.4S, v17.4S, v22.4S + st1 { v2.4S}, [x0], #16 + sqrdmulh v30.4S, v18.4S, v24.4S + st1 { v3.4S}, [x0], #16 + sqrdmulh v31.4S, v19.4S, v26.4S + + dup v20.4S, w20 + + mul v16.4S, v16.4S, v21.4S + mul v17.4S, v17.4S, v23.4S + mul v18.4S, v18.4S, v25.4S + mul v19.4S, v19.4S, v27.4S + + mls v16.4S, v28.4S, v20.4S + mls v17.4S, v29.4S, v20.4S + mls v18.4S, v30.4S, v20.4S + mls v19.4S, v31.4S, v20.4S + + st1 {v16.4S}, [x0], #16 + st1 {v17.4S}, [x0], #16 + st1 {v18.4S}, [x0], #16 + st1 {v19.4S}, [x0], #16 + + pop_all + + br lr + + + + + + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/__asm_narrow.S b/src/kem/saber/pqclean_firesaber_aarch64/__asm_narrow.S new file mode 100644 index 0000000000..1909c7da27 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/__asm_narrow.S @@ -0,0 +1,247 @@ + +#include "SABER_params.h" + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_round +.global _PQCLEAN_FIRESABER_AARCH64_asm_round +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_round, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_round: +_PQCLEAN_FIRESABER_AARCH64_asm_round: + + + .equ srv, (SABER_EQ-SABER_EP) + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + + srshr v0.8H, v0.8H, #srv + srshr v2.8H, v2.8H, #srv + srshr v4.8H, v4.8H, #srv + srshr v6.8H, v6.8H, #srv + + mov x7, #7 + _round_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + + srshr v0.8H, v0.8H, #srv + srshr v2.8H, v2.8H, #srv + srshr v4.8H, v4.8H, #srv + srshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _round_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_enc_add_msg +.global _PQCLEAN_FIRESABER_AARCH64_asm_enc_add_msg +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_enc_add_msg, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_enc_add_msg: +_PQCLEAN_FIRESABER_AARCH64_asm_enc_add_msg: + + .equ srv, (SABER_EP-SABER_ET) + .equ slv, (SABER_EP-1) + + dup v30.8H, w3 + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + mov x7, #7 + _enc_add_msg_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, 
v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _enc_add_msg_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_dec_get_msg +.global _PQCLEAN_FIRESABER_AARCH64_asm_dec_get_msg +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_dec_get_msg, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_dec_get_msg: +_PQCLEAN_FIRESABER_AARCH64_asm_dec_get_msg: + + .equ srv, (SABER_EP-1) + .equ slv, (SABER_EP-SABER_ET) + + dup v30.8H, w3 + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + mov x7, #7 + _dec_get_msg_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _dec_get_msg_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/__asm_pack_unpack.S b/src/kem/saber/pqclean_firesaber_aarch64/__asm_pack_unpack.S new file mode 100644 index 0000000000..f7864be896 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/__asm_pack_unpack.S @@ -0,0 +1,345 @@ + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_1_to_16 +.global _PQCLEAN_FIRESABER_AARCH64_asm_1_to_16 +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_1_to_16, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_1_to_16: +_PQCLEAN_FIRESABER_AARCH64_asm_1_to_16: + + mov x15, #8 + _1_to_16_outer_loop: + + ldr w2, [x1], #4 + + mov x11, #4 + _1_to_16_inner_loop: + + sbfx w3, w2, #0, #1 + strh w3, [x0], #2 + sbfx w4, w2, #1, #1 + strh w4, [x0], #2 + sbfx w5, w2, #2, #1 + strh w5, [x0], #2 + sbfx w6, w2, #3, #1 + strh w6, [x0], #2 + sbfx w7, w2, #4, #1 + strh w7, [x0], #2 + sbfx w8, w2, #5, #1 + strh w8, [x0], #2 + sbfx w9, w2, #6, #1 + strh w9, [x0], #2 + sbfx w10, w2, #7, #1 + strh w10, [x0], #2 + + lsr w2, w2, #8 + + sub x11, x11, #1 + cbnz x11, _1_to_16_inner_loop + + sub x15, x15, #1 + cbnz x15, _1_to_16_outer_loop + + br lr + + 
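+// asm_4_to_16 below unpacks eight signed 4-bit values from each 32-bit word into +// sign-extended 16-bit halfwords (32 words -> 256 coefficients). A plain-C sketch +// of the same transform (illustrative only, not part of this patch): +// +//     for (int k = 0; k < 8; k++) +//         out[k] = (int16_t)((int32_t)(w << (28 - 4 * k)) >> 28); +// +// sbfx performs the shift/sign-extend pair in a single instruction.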
+.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_4_to_16 +.global _PQCLEAN_FIRESABER_AARCH64_asm_4_to_16 +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_4_to_16, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_4_to_16: +_PQCLEAN_FIRESABER_AARCH64_asm_4_to_16: + + mov x7, #32 + _4_to_16_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + strh w3, [x0], #2 + sbfx w4, w2, #4, #4 + strh w4, [x0], #2 + sbfx w5, w2, #8, #4 + strh w5, [x0], #2 + sbfx w6, w2, #12, #4 + strh w6, [x0], #2 + + sbfx w3, w2, #16, #4 + strh w3, [x0], #2 + sbfx w4, w2, #20, #4 + strh w4, [x0], #2 + sbfx w5, w2, #24, #4 + strh w5, [x0], #2 + sbfx w6, w2, #28, #4 + strh w6, [x0], #2 + + sub x7, x7, #1 + cbnz x7, _4_to_16_loop + + br lr + + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_10_to_32 +.global _PQCLEAN_FIRESABER_AARCH64_asm_10_to_32 +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_10_to_32, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_10_to_32: +_PQCLEAN_FIRESABER_AARCH64_asm_10_to_32: + + mov x7, #16 + _10_to_32_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #10 + str w3, [x0], #4 + sbfx w4, w2, #10, #10 + str w4, [x0], #4 + sbfx w5, w2, #20, #10 + str w5, [x0], #4 + lsr w6, w2, #30 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #8 + lsl w3, w3, #2 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #8, #10 + str w4, [x0], #4 + sbfx w5, w2, #18, #10 + str w5, [x0], #4 + lsr w6, w2, #28 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #6 + lsl w3, w3, #4 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #6, #10 + str w4, [x0], #4 + sbfx w5, w2, #16, #10 + str w5, [x0], #4 + lsr w6, w2, #26 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + lsl w3, w3, #6 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #4, #10 + str w4, [x0], #4 + sbfx w5, w2, #14, #10 + str w5, [x0], #4 + lsr w6, w2, #24 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #2 + lsl w3, w3, #8 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #2, #10 + str w4, [x0], #4 + sbfx w5, w2, #12, #10 + str w5, [x0], #4 + sbfx w6, w2, #22, #10 + str w6, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _10_to_32_loop + + br lr + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_13_to_32 +.global _PQCLEAN_FIRESABER_AARCH64_asm_13_to_32 +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_13_to_32, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_13_to_32: +_PQCLEAN_FIRESABER_AARCH64_asm_13_to_32: + + mov x7, #8 + _13_to_32_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #13 + str w3, [x0], #4 + sbfx w4, w2, #13, #13 + str w4, [x0], #4 + lsr w5, w2, #26 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #7 + lsl w3, w3, #6 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #7, #13 + str w4, [x0], #4 + lsr w5, w2, #20 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #1 + lsl w3, w3, #12 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #1, #13 + str w4, [x0], #4 + sbfx w5, w2, #14, #13 + str w5, [x0], #4 + lsr w5, w2, #27 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #8 + lsl w3, w3, #5 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #8, #13 + str w4, [x0], #4 + lsr w5, w2, #21 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #2 + lsl w3, w3, #11 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #2, #13 + str w4, [x0], #4 + sbfx w5, w2, #15, #13 + str w5, [x0], #4 + lsr w5, w2, #28 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #9 + lsl w3, w3, #4 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #9, #13 + str w4, [x0], #4 + lsr w5, w2, #22 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #3 + lsl w3, w3, #10 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #3, #13 + str w4, [x0], #4 + sbfx w5, 
w2, #16, #13 + str w5, [x0], #4 + lsr w5, w2, #29 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #10 + lsl w3, w3, #3 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #10, #13 + str w4, [x0], #4 + lsr w5, w2, #23 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + lsl w3, w3, #9 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #4, #13 + str w4, [x0], #4 + sbfx w5, w2, #17, #13 + str w5, [x0], #4 + lsr w5, w2, #30 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #11 + lsl w3, w3, #2 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #11, #13 + str w4, [x0], #4 + lsr w5, w2, #24 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #5 + lsl w3, w3, #8 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #5, #13 + str w4, [x0], #4 + sbfx w5, w2, #18, #13 + str w5, [x0], #4 + lsr w5, w2, #31 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #12 + lsl w3, w3, #1 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #12, #13 + str w4, [x0], #4 + lsr w5, w2, #25 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #6 + lsl w3, w3, #7 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #6, #13 + str w4, [x0], #4 + sbfx w5, w2, #19, #13 + str w5, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _13_to_32_loop + + br lr + + +.align 2 +.global PQCLEAN_FIRESABER_AARCH64_asm_16_to_32 +.global _PQCLEAN_FIRESABER_AARCH64_asm_16_to_32 +#ifndef __clang__ +.type PQCLEAN_FIRESABER_AARCH64_asm_16_to_32, %function +#endif +PQCLEAN_FIRESABER_AARCH64_asm_16_to_32: +_PQCLEAN_FIRESABER_AARCH64_asm_16_to_32: + + mov x7, #128 + _sbfx_loop: + + ldr w4, [x1], #4 + sbfx w5, w4, #0, #13 + sbfx w6, w4, #16, #13 + str w5, [x0], #4 + str w6, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _sbfx_loop + + br lr + + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/api.h b/src/kem/saber/pqclean_firesaber_aarch64/api.h new file mode 100644 index 0000000000..d06475eb25 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/api.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_FIRESABER_AARCH64_API_H +#define PQCLEAN_FIRESABER_AARCH64_API_H + + +#define PQCLEAN_FIRESABER_AARCH64_CRYPTO_ALGNAME "FireSaber" +#define PQCLEAN_FIRESABER_AARCH64_CRYPTO_BYTES 32 +#define PQCLEAN_FIRESABER_AARCH64_CRYPTO_CIPHERTEXTBYTES 1472 +#define PQCLEAN_FIRESABER_AARCH64_CRYPTO_PUBLICKEYBYTES 1312 +#define PQCLEAN_FIRESABER_AARCH64_CRYPTO_SECRETKEYBYTES 3040 + +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + + +#endif /* PQCLEAN_FIRESABER_AARCH64_API_H */ diff --git a/src/kem/saber/pqclean_firesaber_aarch64/cbd.c b/src/kem/saber/pqclean_firesaber_aarch64/cbd.c new file mode 100644 index 0000000000..da2e96be26 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/cbd.c @@ -0,0 +1,66 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. + * Copyright (c) 2020 by Cryptographic Engineering Research Group (CERG) + * ECE Department, George Mason University + * Fairfax, VA, U.S.A. 
+ * Author: Duc Tri Nguyen +=============================================================================*/ + +#include "cbd.h" +#include <arm_neon.h> + +#define vload4(c, ptr) c = vld4q_u8(ptr); +#define vstore4(ptr, c) vst4q_u16(ptr, c); + +// c = a & b +#define vand8(c, a, b) c = vandq_u8(a, b); + +// c = a >> n +#define vsr8(c, a, n) c = vshrq_n_u8(a, n); + +// c = a + b +#define vadd8(c, a, b) c = vaddq_u8(a, b); + +// low c = (uint16_t) (a - b) +#define vsubll8(c, a, b) c = vsubl_u8(a, b); + +// high c = (uint16_t) (a - b) +#define vsublh8(c, a, b) c = vsubl_high_u8(a, b); + +static uint32_t load24_littleendian(const uint8_t x[3]) { + uint32_t r; + r = (uint32_t)x[0]; + r |= (uint32_t)x[1] << 8; + r |= (uint32_t)x[2] << 16; + return r; +} + +// cbd3: centered binomial distribution with parameter mu = 6; each coefficient is +// the difference of two 3-bit Hamming weights drawn from the SHAKE output +static void cbd3(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { + unsigned int i, j; + uint32_t t, d; + int16_t a, b; + + for (i = 0; i < SABER_N / 4; i++) { + t = load24_littleendian(buf + 3 * i); + d = t & 0x00249249; + d += (t >> 1) & 0x00249249; + d += (t >> 2) & 0x00249249; + + for (j = 0; j < 4; j++) { + a = (d >> (6 * j + 0)) & 0x7; + b = (d >> (6 * j + 3)) & 0x7; + s[4 * i + j] = a - b; + } + } +} + + + + +void cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { + cbd3(s, buf); +} diff --git a/src/kem/saber/pqclean_firesaber_aarch64/cbd.h b/src/kem/saber/pqclean_firesaber_aarch64/cbd.h new file mode 100644 index 0000000000..6e5c360cb3 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/cbd.h @@ -0,0 +1,17 @@ +#ifndef CBD_H +#define CBD_H +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M.
Schanck, Peter Schwabe & Damien Stehle +----------------------------------------------------------------------*/ + +#include "SABER_params.h" +#include <stdint.h> + +#define cbd SABER_NAMESPACE(cbd) +void cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); + +#endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/fips202x2.c b/src/kem/saber/pqclean_firesaber_aarch64/fips202x2.c new file mode 100644 index 0000000000..3924900e9e --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/fips202x2.c @@ -0,0 +1,646 @@ +#include "fips202x2.h" +#include <arm_neon.h> +#include <stddef.h> + + +#define NROUNDS 24 + +// Define NEON operation +// c = load(ptr) +#define vload(ptr) vld1q_u64(ptr); +// ptr <= c; +#define vstore(ptr, c) vst1q_u64(ptr, c); +// c = a ^ b +#define vxor(c, a, b) c = veorq_u64(a, b); +// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) +#define vROL(out, a, offset) \ + (out) = vshlq_n_u64(a, offset); \ + (out) = vsriq_n_u64(out, a, 64 - (offset)); +// Xor chain: out = a ^ b ^ c ^ d ^ e +#define vXOR4(out, a, b, c, d, e) \ + (out) = veorq_u64(a, b); \ + (out) = veorq_u64(out, c); \ + (out) = veorq_u64(out, d); \ + (out) = veorq_u64(out, e); +// Not And c = ~a & b +// #define vbic(c, a, b) c = vbicq_u64(b, a); +// Xor Not And: out = a ^ ( (~b) & c) +#define vXNA(out, a, b, c) \ + (out) = vbicq_u64(c, b); \ + (out) = veorq_u64(out, a); +// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not supported +#define vrxor(c, a, b) c = vrax1q_u64(a, b); +// End Define + +/* Keccak round constants */ +static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +/************************************************* +* Name: KeccakF1600_StatePermutex2 +* +* Description: The Keccak F1600 Permutation +* +* Arguments: - uint64_t *state: pointer to input/output Keccak state +**************************************************/ +static inline +void KeccakF1600_StatePermutex2(v128 state[25]) { + v128 Aba, Abe, Abi, Abo, Abu; + v128 Aga, Age, Agi, Ago, Agu; + v128 Aka, Ake, Aki, Ako, Aku; + v128 Ama, Ame, Ami, Amo, Amu; + v128 Asa, Ase, Asi, Aso, Asu; + v128 BCa, BCe, BCi, BCo, BCu; // tmp + v128 Da, De, Di, Do, Du; // D + v128 Eba, Ebe, Ebi, Ebo, Ebu; + v128 Ega, Ege, Egi, Ego, Egu; + v128 Eka, Eke, Eki, Eko, Eku; + v128 Ema, Eme, Emi, Emo, Emu; + v128 Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = state[0]; + Abe = state[1]; + Abi = state[2]; + Abo = state[3]; + Abu = state[4]; + Aga = state[5]; + Age = state[6]; + Agi = state[7]; + Ago = state[8]; + Agu = state[9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + 
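// fifth (s) row of the 5x5 Keccak state: words 20..24 +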
Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for (int round = 0; round < NROUNDS; round += 2) { + // prepareTheta + vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); + vXOR4(BCe, Abe, Age, Ake, Ame, Ase); + vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); + vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); + vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Aba, Aba, Da); + vxor(Age, Age, De); + vROL(BCe, Age, 44); + vxor(Aki, Aki, Di); + vROL(BCi, Aki, 43); + vxor(Amo, Amo, Do); + vROL(BCo, Amo, 21); + vxor(Asu, Asu, Du); + vROL(BCu, Asu, 14); + vXNA(Eba, Aba, BCe, BCi); + vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); + vXNA(Ebe, BCe, BCi, BCo); + vXNA(Ebi, BCi, BCo, BCu); + vXNA(Ebo, BCo, BCu, Aba); + vXNA(Ebu, BCu, Aba, BCe); + + vxor(Abo, Abo, Do); + vROL(BCa, Abo, 28); + vxor(Agu, Agu, Du); + vROL(BCe, Agu, 20); + vxor(Aka, Aka, Da); + vROL(BCi, Aka, 3); + vxor(Ame, Ame, De); + vROL(BCo, Ame, 45); + vxor(Asi, Asi, Di); + vROL(BCu, Asi, 61); + vXNA(Ega, BCa, BCe, BCi); + vXNA(Ege, BCe, BCi, BCo); + vXNA(Egi, BCi, BCo, BCu); + vXNA(Ego, BCo, BCu, BCa); + vXNA(Egu, BCu, BCa, BCe); + + vxor(Abe, Abe, De); + vROL(BCa, Abe, 1); + vxor(Agi, Agi, Di); + vROL(BCe, Agi, 6); + vxor(Ako, Ako, Do); + vROL(BCi, Ako, 25); + vxor(Amu, Amu, Du); + vROL(BCo, Amu, 8); + vxor(Asa, Asa, Da); + vROL(BCu, Asa, 18); + vXNA(Eka, BCa, BCe, BCi); + vXNA(Eke, BCe, BCi, BCo); + vXNA(Eki, BCi, BCo, BCu); + vXNA(Eko, BCo, BCu, BCa); + vXNA(Eku, BCu, BCa, BCe); + + vxor(Abu, Abu, Du); + vROL(BCa, Abu, 27); + vxor(Aga, Aga, Da); + vROL(BCe, Aga, 36); + vxor(Ake, Ake, De); + vROL(BCi, Ake, 10); + vxor(Ami, Ami, Di); + vROL(BCo, Ami, 15); + vxor(Aso, Aso, Do); + vROL(BCu, Aso, 56); + vXNA(Ema, BCa, BCe, BCi); + vXNA(Eme, BCe, BCi, BCo); + vXNA(Emi, BCi, BCo, BCu); + vXNA(Emo, BCo, BCu, BCa); + vXNA(Emu, BCu, BCa, BCe); + + vxor(Abi, Abi, Di); + vROL(BCa, Abi, 62); + vxor(Ago, Ago, Do); + vROL(BCe, Ago, 55); + vxor(Aku, Aku, Du); + vROL(BCi, Aku, 39); + vxor(Ama, Ama, Da); + vROL(BCo, Ama, 41); + vxor(Ase, Ase, De); + vROL(BCu, Ase, 2); + vXNA(Esa, BCa, BCe, BCi); + vXNA(Ese, BCe, BCi, BCo); + vXNA(Esi, BCi, BCo, BCu); + vXNA(Eso, BCo, BCu, BCa); + vXNA(Esu, BCu, BCa, BCe); + + // Next Round + + // prepareTheta + vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); + vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); + vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); + vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); + vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Eba, Eba, Da); + vxor(Ege, Ege, De); + vROL(BCe, Ege, 44); + vxor(Eki, Eki, Di); + vROL(BCi, Eki, 43); + vxor(Emo, Emo, Do); + vROL(BCo, Emo, 21); + vxor(Esu, Esu, Du); + vROL(BCu, Esu, 14); + vXNA(Aba, Eba, BCe, BCi); + vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); + vXNA(Abe, BCe, BCi, BCo); + vXNA(Abi, BCi, BCo, BCu); + vXNA(Abo, BCo, BCu, Eba); + vXNA(Abu, BCu, Eba, BCe); + + vxor(Ebo, Ebo, Do); + vROL(BCa, Ebo, 28); + vxor(Egu, Egu, Du); + vROL(BCe, Egu, 20); + vxor(Eka, Eka, Da); + vROL(BCi, Eka, 3); + vxor(Eme, Eme, De); + vROL(BCo, Eme, 45); + vxor(Esi, 
Esi, Di); + vROL(BCu, Esi, 61); + vXNA(Aga, BCa, BCe, BCi); + vXNA(Age, BCe, BCi, BCo); + vXNA(Agi, BCi, BCo, BCu); + vXNA(Ago, BCo, BCu, BCa); + vXNA(Agu, BCu, BCa, BCe); + + vxor(Ebe, Ebe, De); + vROL(BCa, Ebe, 1); + vxor(Egi, Egi, Di); + vROL(BCe, Egi, 6); + vxor(Eko, Eko, Do); + vROL(BCi, Eko, 25); + vxor(Emu, Emu, Du); + vROL(BCo, Emu, 8); + vxor(Esa, Esa, Da); + vROL(BCu, Esa, 18); + vXNA(Aka, BCa, BCe, BCi); + vXNA(Ake, BCe, BCi, BCo); + vXNA(Aki, BCi, BCo, BCu); + vXNA(Ako, BCo, BCu, BCa); + vXNA(Aku, BCu, BCa, BCe); + + vxor(Ebu, Ebu, Du); + vROL(BCa, Ebu, 27); + vxor(Ega, Ega, Da); + vROL(BCe, Ega, 36); + vxor(Eke, Eke, De); + vROL(BCi, Eke, 10); + vxor(Emi, Emi, Di); + vROL(BCo, Emi, 15); + vxor(Eso, Eso, Do); + vROL(BCu, Eso, 56); + vXNA(Ama, BCa, BCe, BCi); + vXNA(Ame, BCe, BCi, BCo); + vXNA(Ami, BCi, BCo, BCu); + vXNA(Amo, BCo, BCu, BCa); + vXNA(Amu, BCu, BCa, BCe); + + vxor(Ebi, Ebi, Di); + vROL(BCa, Ebi, 62); + vxor(Ego, Ego, Do); + vROL(BCe, Ego, 55); + vxor(Eku, Eku, Du); + vROL(BCi, Eku, 39); + vxor(Ema, Ema, Da); + vROL(BCo, Ema, 41); + vxor(Ese, Ese, De); + vROL(BCu, Ese, 2); + vXNA(Asa, BCa, BCe, BCi); + vXNA(Ase, BCe, BCi, BCo); + vXNA(Asi, BCi, BCo, BCu); + vXNA(Aso, BCo, BCu, BCa); + vXNA(Asu, BCu, BCa, BCe); + } + + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +/************************************************* +* Name: keccakx2_absorb +* +* Description: Absorb step of Keccak; +* non-incremental, starts by zeroeing the state. 
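+* Both inputs are absorbed in parallel: lane 0 of every 128-bit state word tracks in0, lane 1 tracks in1.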
+* +* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - const uint8_t *m: pointer to input to be absorbed into s +* - size_t mlen: length of input in bytes +* - uint8_t p: domain-separation byte for different +* Keccak-derived functions +**************************************************/ +static +void keccakx2_absorb(v128 s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen, + uint8_t p) { + size_t i, pos = 0; + + // Declare SIMD registers + v128 tmp, mask; + uint64x1_t a, b; + uint64x2_t a1, b1, atmp1, btmp1; + uint64x2x2_t a2, b2, atmp2, btmp2; + // End + + for (i = 0; i < 25; ++i) { + s[i] = vdupq_n_u64(0); + } + + // Load in0[i] to register, then in1[i] to register, exchange them + while (inlen >= r) { + for (i = 0; i < r / 8 - 1; i += 4) { + a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); + b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); + atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); + // AC = zip2(AB and CD) + btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); + btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); + + vxor(s[i + 0], s[i + 0], atmp2.val[0]); + vxor(s[i + 1], s[i + 1], btmp2.val[0]); + vxor(s[i + 2], s[i + 2], atmp2.val[1]); + vxor(s[i + 3], s[i + 3], btmp2.val[1]); + + pos += 8 * 2 * 2; + } + // Last iteration + i = r / 8 - 1; + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + pos += 8; + + KeccakF1600_StatePermutex2(s); + inlen -= r; + } + + i = 0; + while (inlen >= 16) { + a1 = vld1q_u64((uint64_t *)&in0[pos]); + b1 = vld1q_u64((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp1 = vzip1q_u64(a1, b1); + // AC = zip2(AB and CD) + btmp1 = vzip2q_u64(a1, b1); + + vxor(s[i + 0], s[i + 0], atmp1); + vxor(s[i + 1], s[i + 1], btmp1); + + i += 2; + pos += 8 * 2; + inlen -= 8 * 2; + } + + if (inlen >= 8) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + + i++; + pos += 8; + inlen -= 8; + } + + if (inlen) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); + tmp = vandq_u64(tmp, mask); + vxor(s[i], s[i], tmp); + } + + tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); + vxor(s[i], s[i], tmp); + + mask = vdupq_n_u64(1ULL << 63); + vxor(s[r / 8 - 1], s[r / 8 - 1], mask); +} + +/************************************************* +* Name: keccak_squeezeblocks +* +* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. +* Modifies the state. Can be called multiple times to keep +* squeezing, i.e., is incremental. 
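+* Each output block is deinterleaved from the two 64-bit lanes into out0 and out1.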
+* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed (written to h) +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - uint64_t *s: pointer to input/output Keccak state +**************************************************/ +static +void keccakx2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + unsigned int r, + v128 s[25]) { + unsigned int i; + + uint64x1_t a, b; + uint64x2x2_t a2, b2; + + while (nblocks > 0) { + KeccakF1600_StatePermutex2(s); + + for (i = 0; i < r / 8 - 1; i += 4) { + a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); + b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); + a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); + b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); + vst1q_u64_x2((uint64_t *)out0, a2); + vst1q_u64_x2((uint64_t *)out1, b2); + + out0 += 32; + out1 += 32; + } + + i = r / 8 - 1; + // Last iteration + a = vget_low_u64(s[i]); + b = vget_high_u64(s[i]); + vst1_u64((uint64_t *)out0, a); + vst1_u64((uint64_t *)out1, b); + + out0 += 8; + out1 += 8; + + --nblocks; + } +} + +/************************************************* +* Name: shake128x2_absorb +* +* Description: Absorb step of the SHAKE128 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake128_squeezeblocks +* +* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of +* SHAKE128_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake128x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); +} + +/************************************************* +* Name: shake256_absorb +* +* Description: Absorb step of the SHAKE256 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake256_squeezeblocks +* +* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of +* SHAKE256_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. 
+* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake256x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); +} + +/************************************************* +* Name: shake128 +* +* Description: SHAKE128 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[2][SHAKE128_RATE]; + keccakx2_state state; + + shake128x2_absorb(&state, in0, in1, inlen); + shake128x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + shake128x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} + +/************************************************* +* Name: shake256 +* +* Description: SHAKE256 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[2][SHAKE256_RATE]; + keccakx2_state state; + + shake256x2_absorb(&state, in0, in1, inlen); + shake256x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + shake256x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} diff --git a/src/kem/saber/pqclean_firesaber_aarch64/fips202x2.h b/src/kem/saber/pqclean_firesaber_aarch64/fips202x2.h new file mode 100644 index 0000000000..11579f3015 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/fips202x2.h @@ -0,0 +1,54 @@ +#ifndef FIPS202X2_H +#define FIPS202X2_H + +#include "SABER_params.h" +#include <arm_neon.h> +#include <stddef.h> +#include "fips202.h" +typedef uint64x2_t v128; + +typedef struct { + v128 s[25]; +} keccakx2_state; + + +#define shake128x2_absorb SABER_NAMESPACE(shake128x2_absorb) +void shake128x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); + +#define shake128x2_squeezeblocks SABER_NAMESPACE(shake128x2_squeezeblocks) +void shake128x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state); + +#define shake256x2_absorb SABER_NAMESPACE(shake256x2_absorb) +void shake256x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); + +#define shake256x2_squeezeblocks SABER_NAMESPACE(shake256x2_squeezeblocks) +void shake256x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + 
keccakx2_state *state); + +#define shake128x2 SABER_NAMESPACE(shake128x2) +void shake128x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); + +#define shake256x2 SABER_NAMESPACE(shake256x2) +void shake256x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); +#endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/kem.c b/src/kem/saber/pqclean_firesaber_aarch64/kem.c new file mode 100644 index 0000000000..368ef77684 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/kem.c @@ -0,0 +1,84 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. +=============================================================================*/ + +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "fips202.h" +#include "kem.h" +#include "randombytes.h" +#include "verify.h" +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + int i; + + indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk + for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { + sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk + } + + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended. + + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number. + // This is output when check in crypto_kem_dec() fails.
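+ // Final layout: sk = indcpa_sk || indcpa_pk || H(pk) (32 bytes) || z (32 bytes)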
+ return (0); +} + +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { + + unsigned char kr[64]; // Will contain key, coins + unsigned char buf[64]; + + randombytes(buf, 32); + + sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output + + sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM + + sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); + // K^ <-- kr[0:31] + // noiseseed (r) <-- kr[32:63]; + indcpa_kem_enc(buf, kr + 32, pk, c); // buf[0:31] contains message; kr[32:63] contains randomness r; + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} + +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { + int i, fail; + unsigned char cmp[SABER_BYTES_CCA_DEC]; + unsigned char buf[64]; + unsigned char kr[64]; // Will contain key, coins + const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES; + + indcpa_kem_dec(sk, c, buf); // buf[0:31] <-- message + + // Multitarget countermeasure for coins + contributory KEM + for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk + buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i]; + } + + sha3_512(kr, buf, 64); + + indcpa_kem_enc(buf, kr + 32, pk, cmp); + + fail = verify(c, cmp, SABER_BYTES_CCA_DEC); + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c) + + cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} diff --git a/src/kem/saber/pqclean_firesaber_aarch64/kem.h b/src/kem/saber/pqclean_firesaber_aarch64/kem.h new file mode 100644 index 0000000000..7898f112be --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/kem.h @@ -0,0 +1,17 @@ +#ifndef KEM_H +#define KEM_H +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. 
+=============================================================================*/ + +#include <stdint.h> + +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk); +int PQCLEAN_FIRESABER_AARCH64_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk); + +#endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/macros.inc b/src/kem/saber/pqclean_firesaber_aarch64/macros.inc new file mode 100644 index 0000000000..88c3675f29 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/macros.inc @@ -0,0 +1,57 @@ + +#ifndef MACROS_S +#define MACROS_S + +#include "macros_common.inc" + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro wrap_4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3, dS, qS, dD + + \mulacc \l0\dD, \a0\dS, \b0\dS + \mulacc2 \h0\dD, \a0\qS, \b0\qS + \mulacc \l1\dD, \a0\dS, \b1\dS + \mulacc2 \h1\dD, \a0\qS, \b1\qS + \mulacc \l2\dD, \a0\dS, \b2\dS + \mulacc2 \h2\dD, \a0\qS, \b2\qS + \mulacc \l3\dD, \a0\dS, \b3\dS + \mulacc2 \h3\dD, \a0\qS, \b3\qS + +.endm + +.macro _4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_4x4_asymmetric \mulacc, \mulacc2, \a0, \b0, \b1, \b2, \b3, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .2S, .4S, .2D +.endm + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_add_sub \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm + +#endif + diff --git 
a/src/kem/saber/pqclean_firesaber_aarch64/macros_common.inc b/src/kem/saber/pqclean_firesaber_aarch64/macros_common.inc new file mode 100644 index 0000000000..26e7cbb5da --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/macros_common.inc @@ -0,0 +1,434 @@ + +#ifndef MACROS_COMMON +#define MACROS_COMMON + +// for ABI + +.macro push_all + + sub sp, sp, #(16*9) + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp d8, d9, [sp, #16*5] + stp d10, d11, [sp, #16*6] + stp d12, d13, [sp, #16*7] + stp d14, d15, [sp, #16*8] + +.endm + +.macro pop_all + + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp d10, d11, [sp, #16*6] + ldp d12, d13, [sp, #16*7] + ldp d14, d15, [sp, #16*8] + add sp, sp, #(16*9) + +.endm + +// vector-scalar butterflies + +.macro wrap_dX_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + sub \b2\wX, \a2\wX, \t2\wX + sub \b3\wX, \a3\wX, \t3\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + add \a2\wX, \a2\wX, \t2\wX + add \a3\wX, \a3\wX, \t3\wX + +.endm + +.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, 
t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b4\wX, \a4\wX, \t4\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b5\wX, \a5\wX, \t5\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b6\wX, \a6\wX, \t6\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + sub \b7\wX, \a7\wX, \t7\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a4\wX, \a4\wX, \t4\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a5\wX, \a5\wX, \t5\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a6\wX, \a6\wX, \t6\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + add \a7\wX, \a7\wX, \t7\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +// vector-vector butterflies + +.macro wrap_dX_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + mul \t1\wX, \b1\wX, \h1\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +// vector-scalar Barrett reduction + +.macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + + srshr 
\t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// vector-vector Barrett reduction + +.macro wrap_qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + + srshr \t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\wX + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\wX + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\wX + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// Montgomery multiplication + +.macro wrap_qX_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \b0\wX, \t0\wX, \z0\nX[\h0] + mul \b1\wX, \t1\wX, \z1\nX[\h1] + mul \b2\wX, \t2\wX, \z2\nX[\h2] + mul \b3\wX, \t3\wX, \z3\nX[\h3] + + sqrdmulh \t0\wX, \t0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \t1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \t2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \t3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +// Montgomery reduction with long + +.macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX + + uzp1 \t0\wX, \l0\wX, \h0\wX + uzp1 \t1\wX, \l1\wX, \h1\wX + uzp1 \t2\wX, \l2\wX, \h2\wX + uzp1 \t3\wX, \l3\wX, \h3\wX + + mul \t0\wX, 
\t0\wX, \Qprime\wX + mul \t1\wX, \t1\wX, \Qprime\wX + mul \t2\wX, \t2\wX, \Qprime\wX + mul \t3\wX, \t3\wX, \Qprime\wX + + smlal \l0\dwX, \t0\lX, \Q\lX + smlal2 \h0\dwX, \t0\wX, \Q\wX + smlal \l1\dwX, \t1\lX, \Q\lX + smlal2 \h1\dwX, \t1\wX, \Q\wX + smlal \l2\dwX, \t2\lX, \Q\lX + smlal2 \h2\dwX, \t2\wX, \Q\wX + smlal \l3\dwX, \t3\lX, \Q\lX + smlal2 \h3\dwX, \t3\wX, \Q\wX + + uzp2 \c0\wX, \l0\wX, \h0\wX + uzp2 \c1\wX, \l1\wX, \h1\wX + uzp2 \c2\wX, \l2\wX, \h2\wX + uzp2 \c3\wX, \l3\wX, \h3\wX + +.endm + +// add_sub, sub_add + +.macro wrap_qX_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + add \s0\wX, \a0\wX, \b0\wX + sub \t0\wX, \a0\wX, \b0\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s3\wX, \a3\wX, \b3\wX + sub \t3\wX, \a3\wX, \b3\wX + +.endm + +.macro wrap_qX_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + sub \t0\wX, \a0\wX, \b0\wX + add \s0\wX, \a0\wX, \b0\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t3\wX, \a3\wX, \b3\wX + add \s3\wX, \a3\wX, \b3\wX + +.endm + + +#endif + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/pack_unpack.c b/src/kem/saber/pqclean_firesaber_aarch64/pack_unpack.c new file mode 100644 index 0000000000..05f8ee3616 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/pack_unpack.c @@ -0,0 +1,191 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. 
+=============================================================================*/ + + +#include "api.h" +#include "pack_unpack.h" +#include <string.h> + +/* This function reduces its input mod T */ +void POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { + size_t j; + const uint16_t *in = data; + uint8_t *out = bytes; + for (j = 0; j < SABER_N / 4; j++) { + out[0] = (uint8_t) ((in[0] & 0x3f) | (in[1] << 6)); + out[1] = (uint8_t) (((in[1] >> 2) & 0x0f) | (in[2] << 4)); + out[2] = (uint8_t) (((in[2] >> 4) & 0x03) | (in[3] << 2)); + in += 4; + out += 3; + } +} + +/* This function does NOT reduce its output mod T */ +void BS2POLT(const uint8_t bytes[SABER_SCALEBYTES_KEM], uint16_t data[SABER_N]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data; + for (j = 0; j < SABER_N / 4; j++) { + out[0] = in[0]; + out[1] = (in[0] >> 6) | (in[1] << 2); + out[2] = (in[1] >> 4) | (in[2] << 4); + out[3] = (in[2] >> 2); + in += 3; + out += 4; + } +} + +/* This function reduces its input mod q */ +void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { + size_t i; + const uint16_t *in = data; + uint8_t *out = bytes; + for (i = 0; i < SABER_N / 8; i++) { + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); + in += 8; + out += 13; + } +} + +/* This function sign-extends its output from q-bit to 16-bit. +This is needed by 16-bit NTTs */ +void BS2POLq(const uint8_t bytes[SABER_POLYBYTES], uint16_t data[SABER_N]) { + size_t i; + const uint8_t *in = bytes; + int16_t *out = (int16_t *)data; + + struct int13_t { // bitfield struct to sign-extend q-bit to 16-bit. +signed int bits: + SABER_EQ; + } q0, q1, q2, q3, q4, q5, q6, q7; + + for (i = 0; i < SABER_N / 8; i++) { + q0.bits = (in[0]) | (in[1] << 8); + q1.bits = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + q2.bits = (in[3] >> 2) | (in[4] << 6); + q3.bits = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + q4.bits = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + q5.bits = (in[8] >> 1) | (in[9] << 7); + q6.bits = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + q7.bits = (in[11] >> 3) | (in[12] << 5); + out[0] = (int16_t)q0.bits; + out[1] = (int16_t)q1.bits; + out[2] = (int16_t)q2.bits; + out[3] = (int16_t)q3.bits; + out[4] = (int16_t)q4.bits; + out[5] = (int16_t)q5.bits; + out[6] = (int16_t)q6.bits; + out[7] = (int16_t)q7.bits; + in += 13; + out += 8; + } +} + +/* This function reduces its input mod p */ +void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { + size_t i; + const uint16_t *in = data; + uint8_t *out = bytes; + for (i = 0; i < SABER_N / 4; i++) { + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); + in += 4; + out += 5; + } +} + +/* This function sign-extends its output from p-bit to 16-bit. 
+This is needed by the NTT */ +void BS2POLp(const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], uint16_t data[SABER_N]) { + size_t j; + const uint8_t *in = bytes; + int16_t *out = (int16_t *)data; + + struct int10_t { // bitfield struct to sign-extend p-bit to 16-bit. +signed int bits: + SABER_EP; + } p0, p1, p2, p3; + + for (j = 0; j < SABER_N / 4; j++) { + p0.bits = (in[0]) | (in[1] << 8); + p1.bits = (in[1] >> 2) | (in[2] << 6); + p2.bits = (in[2] >> 4) | (in[3] << 4); + p3.bits = (in[3] >> 6) | (in[4] << 2); + out[0] = (int16_t)p0.bits; + out[1] = (int16_t)p1.bits; + out[2] = (int16_t)p2.bits; + out[3] = (int16_t)p3.bits; + in += 5; + out += 4; + } +} + +void POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void BS2POLVECq(const uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void BS2POLVECp(const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void BS2POLmsg(const uint8_t bytes[SABER_KEYBYTES], uint16_t data[SABER_N]) { + PQCLEAN_FIRESABER_AARCH64_asm_1_to_16(&(data[0]), &(bytes[0])); +} + +/* This function reduces its input mod 2 */ +void POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { + size_t i, j; + uint8_t byte; + for (j = 0; j < SABER_KEYBYTES; j++) { + byte = 0; + for (i = 0; i < 8; i++) { + byte |= ((data[j * 8 + i] & 0x01) << i); + } + bytes[j] = byte; + } +} + + + + + diff --git a/src/kem/saber/pqclean_firesaber_aarch64/pack_unpack.h b/src/kem/saber/pqclean_firesaber_aarch64/pack_unpack.h new file mode 100644 index 0000000000..49c24eef00 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/pack_unpack.h @@ -0,0 +1,52 @@ +#ifndef PACK_UNPACK_H +#define PACK_UNPACK_H +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. 
+=============================================================================*/ + +#include "SABER_params.h" +#include <stddef.h> +#include <stdint.h> + +extern void PQCLEAN_FIRESABER_AARCH64_asm_1_to_16(void *, const void *); +extern void PQCLEAN_FIRESABER_AARCH64_asm_4_to_16(void *, const void *); + +extern void PQCLEAN_FIRESABER_AARCH64_asm_10_to_32(void *, const void *); +extern void PQCLEAN_FIRESABER_AARCH64_asm_13_to_32(void *, const void *); +extern void PQCLEAN_FIRESABER_AARCH64_asm_16_to_32(void *, const void *); + +#define POLT2BS SABER_NAMESPACE(POLT2BS) +void POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); +#define BS2POLT SABER_NAMESPACE(BS2POLT) +void BS2POLT(const uint8_t bytes[SABER_SCALEBYTES_KEM], uint16_t data[SABER_N]); + +#define POLq2BS SABER_NAMESPACE(POLq2BS) +void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]); +#define POLp2BS SABER_NAMESPACE(POLp2BS) +void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]); + +#define BS2POLq SABER_NAMESPACE(BS2POLq) +void BS2POLq(const uint8_t bytes[SABER_POLYBYTES], uint16_t data[SABER_N]); +#define BS2POLp SABER_NAMESPACE(BS2POLp) +void BS2POLp(const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], uint16_t data[SABER_N]); + +#define POLVECq2BS SABER_NAMESPACE(POLVECq2BS) +void POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]); +#define POLVECp2BS SABER_NAMESPACE(POLVECp2BS) +void POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]); + +#define BS2POLVECq SABER_NAMESPACE(BS2POLVECq) +void BS2POLVECq(const uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]); +#define BS2POLVECp SABER_NAMESPACE(BS2POLVECp) +void BS2POLVECp(const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]); + +#define BS2POLmsg SABER_NAMESPACE(BS2POLmsg) +void BS2POLmsg(const uint8_t bytes[SABER_KEYBYTES], uint16_t data[SABER_N]); +#define POLmsg2BS SABER_NAMESPACE(POLmsg2BS) +void POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); + +#endif diff --git a/src/kem/saber/pqclean_firesaber_aarch64/verify.c b/src/kem/saber/pqclean_firesaber_aarch64/verify.c new file mode 100644 index 0000000000..87a7acc486 --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/verify.c @@ -0,0 +1,34 @@ +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M.
Schanck, Peter Schwabe & Damien Stehle +----------------------------------------------------*/ +#include "verify.h" +#include <stddef.h> +#include <stdint.h> + +/* returns 0 for equal strings, 1 for non-equal strings */ +int verify(const unsigned char *a, const unsigned char *b, size_t len) { + uint64_t r; + size_t i; + r = 0; + + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; + } + + r = (-r) >> 63; + return r; +} + +/* b = 1 means mov, b = 0 means don't mov */ +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { + size_t i; + + b = -b; + for (i = 0; i < len; i++) { + r[i] ^= b & (x[i] ^ r[i]); + } +} diff --git a/src/kem/saber/pqclean_firesaber_aarch64/verify.h b/src/kem/saber/pqclean_firesaber_aarch64/verify.h new file mode 100644 index 0000000000..2a3aabe77d --- /dev/null +++ b/src/kem/saber/pqclean_firesaber_aarch64/verify.h @@ -0,0 +1,21 @@ +#ifndef VERIFY_H +#define VERIFY_H +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle +----------------------------------------------------*/ +#include "SABER_params.h" +#include <stddef.h> + +/* returns 0 for equal strings, 1 for non-equal strings */ +#define verify SABER_NAMESPACE(verify) +int verify(const unsigned char *a, const unsigned char *b, size_t len); + +/* b = 1 means mov, b = 0 means don't mov */ +#define cmov SABER_NAMESPACE(cmov) +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); + +#endif diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/LICENSE b/src/kem/saber/pqclean_lightsaber_aarch64/LICENSE new file mode 100644 index 0000000000..0e259d42c9 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. 
+ +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. 
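The verify()/cmov() pair in verify.c above is what keeps the re-encryption check in crypto_kem_dec() constant time: the comparison result is reduced to a single bit and the fallback key z is selected by masking, never by branching. A minimal stand-alone sketch of the same pattern (hypothetical names ct_compare/ct_select; not part of this patch):

#include <stddef.h>
#include <stdint.h>

/* 0 if equal, 1 otherwise -- same contract as verify() above.
   r accumulates byte XORs, so r < 256 and the top bit of (0 - r)
   is set iff r != 0. */
static int ct_compare(const uint8_t *a, const uint8_t *b, size_t len) {
    uint64_t r = 0;
    for (size_t i = 0; i < len; i++) {
        r |= a[i] ^ b[i];
    }
    return (int)((0 - r) >> 63);
}

/* If fail == 1, overwrite out with alt; if fail == 0, leave out
   unchanged -- the same masking idea as cmov() above. */
static void ct_select(uint8_t *out, const uint8_t *alt, size_t len, uint8_t fail) {
    uint8_t mask = (uint8_t)(0 - fail); /* 0x00 or 0xff */
    for (size_t i = 0; i < len; i++) {
        out[i] ^= mask & (out[i] ^ alt[i]);
    }
}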
diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/NTT.h b/src/kem/saber/pqclean_lightsaber_aarch64/NTT.h
new file mode 100644
index 0000000000..7df944aec6
--- /dev/null
+++ b/src/kem/saber/pqclean_lightsaber_aarch64/NTT.h
@@ -0,0 +1,50 @@
+#ifndef NTT_H
+#define NTT_H
+
+#include <stdint.h>
+
+#include "NTT_params.h"
+
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_top(uint32_t *des, const uint32_t *table, const uint32_t *_constants);
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_bot(uint32_t *des, const uint32_t *table, const uint32_t *_constants);
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_top(uint32_t *des, const uint32_t *table, const uint32_t *_constants);
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_bot(uint32_t *des, const uint32_t *table, const uint32_t *_constants, const uint32_t *_inv_twist_const);
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul(uint32_t *src1, const uint32_t *src2, const uint32_t *src2_asymmetric, const uint32_t *_constants);
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_point_mul_extended(uint32_t *des, const uint32_t *src1, const uint32_t *src2_extended, const uint32_t *_constants);
+
+#define NTT(in) { \
+        PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+        PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \
+    }
+
+#define NTT_heavy(in_asymmetric, in) { \
+        NTT(in); \
+        PQCLEAN_LIGHTSABER_AARCH64_asm_point_mul_extended(in_asymmetric, in, pre_asymmetric_table_Q1_extended, constants); \
+    }
+
+#define iNTT(in) { \
+        PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \
+        PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants, inv_twist_table_all_Q1_extended); \
+    }
+
+static const uint32_t constants[16] = {
+    Q1, Q1prime2
+};
+
+static const uint32_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 3)) << 1] = {
+    0, 0, -119635792, -1424544, 1027317558, 12232619, -496739340, -5914844, -253524894, -3018807, 9103545, 108399, 42771771, 509298, 283911363, 3380629, 0, 0, -66089826, -786954, -259955382, -3095377, -643539471, -7662843, -332278086, -3956548, 703146656, 8372606, -881793531, -10499815, 304160806, 3621746, 0, 0, 34506365, 410879, 663387313, 7899178, -615166382, -7324995, 242706356, 2889987, -1016509854, -12103928, -410776309, -4891253, -1039822114, -12381515, 0, 0, 770061100, 9169379, 176271869, 2098929, 377015451, 4489251, -777437559, -9257213, 185186875, 2205083, -476967921, -5679419, 111859832, 1331953, 0, 0, 267484771, 3185032, -241571930, -2876479, -116066229, -1382040, 605105697, 7205199, 246868243, 2939544, -801225576, -9540465, -29401110, -350089, 0, 0, 461101573, 5490493, -659878385, -7857396, -813049292, -9681254, -610503208, -7269469, 754028719, 8978476, -513464823, -6114000, 974898460, 11608447, 0, 0, -65601052, -781134, 122588677, 1459705, 406381289, 4838920, -584016855, -6954087, 1066347183, 12697358, -347834458, -4141783, -592155281, -7050994, 0, 0, 242486240, 2887366, 1001287142, 11922666, 375772353, 4474449, 752256115, 8957369, 322396534, 3838885, 525597088, 6258463, -971930207, -11573103, 0, 0, -983711428, -11713386, 6721989, 80041, -138847220, -1653301, 687033653, 8180743, -438460075, -5220893, 714691721, 8510077, -689918177, -8215090, 0, 0
+};
+
+static const uint32_t pre_asymmetric_table_Q1_extended[NTT_N << 3] = {
+    -332278086, -3956548, -332278086, -3956548, -332278086, -3956548,
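+    /* [Editor's note] Each pair in this table appears to be (sqrdmulh
+       companion, twiddle): __asm_mul.S loads the pairs with ld2, feeds the
+       first element to sqrdmulh and the second to mul, and finishes with an
+       mls by Q1. Every pair is repeated four times (one copy per 32-bit SIMD
+       lane) and then followed by four copies of its negation. */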
-332278086, -3956548, 332278086, 3956548, 332278086, 3956548, 332278086, 3956548, 332278086, 3956548, 703146656, 8372606, 703146656, 8372606, 703146656, 8372606, 703146656, 8372606, -703146656, -8372606, -703146656, -8372606, -703146656, -8372606, -703146656, -8372606, -881793531, -10499815, -881793531, -10499815, -881793531, -10499815, -881793531, -10499815, 881793531, 10499815, 881793531, 10499815, 881793531, 10499815, 881793531, 10499815, 304160806, 3621746, 304160806, 3621746, 304160806, 3621746, 304160806, 3621746, -304160806, -3621746, -304160806, -3621746, -304160806, -3621746, -304160806, -3621746, 242706356, 2889987, 242706356, 2889987, 242706356, 2889987, 242706356, 2889987, -242706356, -2889987, -242706356, -2889987, -242706356, -2889987, -242706356, -2889987, -1016509854, -12103928, -1016509854, -12103928, -1016509854, -12103928, -1016509854, -12103928, 1016509854, 12103928, 1016509854, 12103928, 1016509854, 12103928, 1016509854, 12103928, -410776309, -4891253, -410776309, -4891253, -410776309, -4891253, -410776309, -4891253, 410776309, 4891253, 410776309, 4891253, 410776309, 4891253, 410776309, 4891253, -1039822114, -12381515, -1039822114, -12381515, -1039822114, -12381515, -1039822114, -12381515, 1039822114, 12381515, 1039822114, 12381515, 1039822114, 12381515, 1039822114, 12381515, -777437559, -9257213, -777437559, -9257213, -777437559, -9257213, -777437559, -9257213, 777437559, 9257213, 777437559, 9257213, 777437559, 9257213, 777437559, 9257213, 185186875, 2205083, 185186875, 2205083, 185186875, 2205083, 185186875, 2205083, -185186875, -2205083, -185186875, -2205083, -185186875, -2205083, -185186875, -2205083, -476967921, -5679419, -476967921, -5679419, -476967921, -5679419, -476967921, -5679419, 476967921, 5679419, 476967921, 5679419, 476967921, 5679419, 476967921, 5679419, 111859832, 1331953, 111859832, 1331953, 111859832, 1331953, 111859832, 1331953, -111859832, -1331953, -111859832, -1331953, -111859832, -1331953, -111859832, -1331953, 605105697, 7205199, 605105697, 7205199, 605105697, 7205199, 605105697, 7205199, -605105697, -7205199, -605105697, -7205199, -605105697, -7205199, -605105697, -7205199, 246868243, 2939544, 246868243, 2939544, 246868243, 2939544, 246868243, 2939544, -246868243, -2939544, -246868243, -2939544, -246868243, -2939544, -246868243, -2939544, -801225576, -9540465, -801225576, -9540465, -801225576, -9540465, -801225576, -9540465, 801225576, 9540465, 801225576, 9540465, 801225576, 9540465, 801225576, 9540465, -29401110, -350089, -29401110, -350089, -29401110, -350089, -29401110, -350089, 29401110, 350089, 29401110, 350089, 29401110, 350089, 29401110, 350089, -610503208, -7269469, -610503208, -7269469, -610503208, -7269469, -610503208, -7269469, 610503208, 7269469, 610503208, 7269469, 610503208, 7269469, 610503208, 7269469, 754028719, 8978476, 754028719, 8978476, 754028719, 8978476, 754028719, 8978476, -754028719, -8978476, -754028719, -8978476, -754028719, -8978476, -754028719, -8978476, -513464823, -6114000, -513464823, -6114000, -513464823, -6114000, -513464823, -6114000, 513464823, 6114000, 513464823, 6114000, 513464823, 6114000, 513464823, 6114000, 974898460, 11608447, 974898460, 11608447, 974898460, 11608447, 974898460, 11608447, -974898460, -11608447, -974898460, -11608447, -974898460, -11608447, -974898460, -11608447, -584016855, -6954087, -584016855, -6954087, -584016855, -6954087, -584016855, -6954087, 584016855, 6954087, 584016855, 6954087, 584016855, 6954087, 584016855, 6954087, 1066347183, 12697358, 1066347183, 12697358, 1066347183, 
12697358, 1066347183, 12697358, -1066347183, -12697358, -1066347183, -12697358, -1066347183, -12697358, -1066347183, -12697358, -347834458, -4141783, -347834458, -4141783, -347834458, -4141783, -347834458, -4141783, 347834458, 4141783, 347834458, 4141783, 347834458, 4141783, 347834458, 4141783, -592155281, -7050994, -592155281, -7050994, -592155281, -7050994, -592155281, -7050994, 592155281, 7050994, 592155281, 7050994, 592155281, 7050994, 592155281, 7050994, 752256115, 8957369, 752256115, 8957369, 752256115, 8957369, 752256115, 8957369, -752256115, -8957369, -752256115, -8957369, -752256115, -8957369, -752256115, -8957369, 322396534, 3838885, 322396534, 3838885, 322396534, 3838885, 322396534, 3838885, -322396534, -3838885, -322396534, -3838885, -322396534, -3838885, -322396534, -3838885, 525597088, 6258463, 525597088, 6258463, 525597088, 6258463, 525597088, 6258463, -525597088, -6258463, -525597088, -6258463, -525597088, -6258463, -525597088, -6258463, -971930207, -11573103, -971930207, -11573103, -971930207, -11573103, -971930207, -11573103, 971930207, 11573103, 971930207, 11573103, 971930207, 11573103, 971930207, 11573103, 687033653, 8180743, 687033653, 8180743, 687033653, 8180743, 687033653, 8180743, -687033653, -8180743, -687033653, -8180743, -687033653, -8180743, -687033653, -8180743, -438460075, -5220893, -438460075, -5220893, -438460075, -5220893, -438460075, -5220893, 438460075, 5220893, 438460075, 5220893, 438460075, 5220893, 438460075, 5220893, 714691721, 8510077, 714691721, 8510077, 714691721, 8510077, 714691721, 8510077, -714691721, -8510077, -714691721, -8510077, -714691721, -8510077, -714691721, -8510077, -689918177, -8215090, -689918177, -8215090, -689918177, -8215090, -689918177, -8215090, 689918177, 8215090, 689918177, 8215090, 689918177, 8215090, 689918177, 8215090 + }; + +static const uint32_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 3)) << 1] = { + 0, 0, 84, 1, 84, 1, 119635792, 1424544, 84, 1, 496739340, 5914844, 119635792, 1424544, -1027317558, -12232619, 0, 0, 84, 1, 84, 1, 119635792, 1424544, 84, 1, 496739340, 5914844, 119635792, 1424544, -1027317558, -12232619, 0, 0, -283911363, -3380629, 983711428, 11713386, -242486240, -2887366, 138847220, 1653301, -375772353, -4474449, -6721989, -80041, -1001287142, -11922666, 0, 0, 496739340, 5914844, -283911363, -3380629, -42771771, -509298, 983711428, 11713386, 65601052, 781134, -242486240, -2887366, -461101573, -5490493, 0, 0, -9103545, -108399, -267484771, -3185032, -770061100, -9169379, 116066229, 1382040, -377015451, -4489251, 241571930, 2876479, -176271869, -2098929, 0, 0, 119635792, 1424544, 496739340, 5914844, -1027317558, -12232619, -283911363, -3380629, -9103545, -108399, -42771771, -509298, 253524894, 3018807, 0, 0, -42771771, -509298, 65601052, 781134, -461101573, -5490493, -406381289, -4838920, 813049292, 9681254, -122588677, -1459705, 659878385, 7857396, 0, 0, -1027317558, -12232619, -9103545, -108399, 253524894, 3018807, -267484771, -3185032, -34506365, -410879, -770061100, -9169379, 66089826, 786954, 0, 0, 253524894, 3018807, -34506365, -410879, 66089826, 786954, 615166382, 7324995, 643539471, 7662843, -663387313, -7899178, 259955382, 3095377, 0, 0 +}; + +static const uint32_t inv_twist_table_all_Q1_extended[ARRAY_N << 1] = { + -806526676, -9603587, -806526676, -9603587, -806526676, -9603587, -806526676, -9603587, 48233192, 574329, 48233192, 574329, 48233192, 574329, 48233192, 574329, -781310380, -9303328, -781310380, -9303328, -781310380, -9303328, -781310380, -9303328, -672564090, 
-8008449, -672564090, -8008449, -672564090, -8008449, -672564090, -8008449, 246168339, 2931210, 246168339, 2931210, 246168339, 2931210, 246168339, 2931210, -1029960130, -12264085, -1029960130, -12264085, -1029960130, -12264085, -1029960130, -12264085, -740184653, -8813630, -740184653, -8813630, -740184653, -8813630, -740184653, -8813630, 161300767, 1920663, 161300767, 1920663, 161300767, 1920663, 161300767, 1920663, -174979977, -2083546, -174979977, -2083546, -174979977, -2083546, -174979977, -2083546, -95582308, -1138131, -95582308, -1138131, -95582308, -1138131, -95582308, -1138131, -605914106, -7214825, -605914106, -7214825, -605914106, -7214825, -605914106, -7214825, 553452597, 6590148, 553452597, 6590148, 553452597, 6590148, 553452597, 6590148, -224497251, -2673165, -224497251, -2673165, -224497251, -2673165, -224497251, -2673165, 276485019, 3292201, 276485019, 3292201, 276485019, 3292201, 276485019, 3292201, 953978590, 11359347, 953978590, 11359347, 953978590, 11359347, 953978590, 11359347, -411604874, -4901119, -411604874, -4901119, -411604874, -4901119, -411604874, -4901119, 833204424, 9921248, 833204424, 9921248, 833204424, 9921248, 833204424, 9921248, 753488464, 8972043, 753488464, 8972043, 753488464, 8972043, 753488464, 8972043, -38469886, -458074, -38469886, -458074, -38469886, -458074, -38469886, -458074, 852175664, 10147145, 852175664, 10147145, 852175664, 10147145, 852175664, 10147145, -278415257, -3315185, -278415257, -3315185, -278415257, -3315185, -278415257, -3315185, -1014095461, -12075179, -1014095461, -12075179, -1014095461, -12075179, -1014095461, -12075179, 307793104, 3664997, 307793104, 3664997, 307793104, 3664997, 307793104, 3664997, -130967039, -1559469, -130967039, -1559469, -130967039, -1559469, -130967039, -1559469, 478387802, 5696326, 478387802, 5696326, 478387802, 5696326, 478387802, 5696326, 692860396, 8250124, 692860396, 8250124, 692860396, 8250124, 692860396, 8250124, 803792144, 9571026, 803792144, 9571026, 803792144, 9571026, 803792144, 9571026, 352456397, 4196818, 352456397, 4196818, 352456397, 4196818, 352456397, 4196818, 230047357, 2739252, 230047357, 2739252, 230047357, 2739252, 230047357, 2739252, -1026754544, -12225915, -1026754544, -12225915, -1026754544, -12225915, -1026754544, -12225915, 992128925, 11813616, 992128925, 11813616, 992128925, 11813616, 992128925, 11813616, -29941449, -356523, -29941449, -356523, -29941449, -356523, -29941449, -356523, -1068560020, -12723707, -1068560020, -12723707, -1068560020, -12723707, -1068560020, -12723707, -581973493, -6929756, -581973493, -6929756, -581973493, -6929756, -581973493, -6929756, -304246804, -3622770, -304246804, -3622770, -304246804, -3622770, -304246804, -3622770, 542646572, 6461477, 542646572, 6461477, 542646572, 6461477, 542646572, 6461477, -7172803, -85409, -7172803, -85409, -7172803, -85409, -7172803, -85409, -417737898, -4974147, -417737898, -4974147, -417737898, -4974147, -417737898, -4974147, -397539264, -4733635, -397539264, -4733635, -397539264, -4733635, -397539264, -4733635, -711017600, -8466328, -711017600, -8466328, -711017600, -8466328, -711017600, -8466328, 340918639, 4059434, 340918639, 4059434, 340918639, 4059434, 340918639, 4059434, -2971193, -35379, -2971193, -35379, -2971193, -35379, -2971193, -35379, -316030964, -3763088, -316030964, -3763088, -316030964, -3763088, -316030964, -3763088, -980706054, -11677600, -980706054, -11677600, -980706054, -11677600, -980706054, -11677600, -799784280, -9523303, -799784280, -9523303, -799784280, -9523303, -799784280, -9523303, 
-606599985, -7222992, -606599985, -7222992, -606599985, -7222992, -606599985, -7222992, 988795687, 11773926, 988795687, 11773926, 988795687, 11773926, 988795687, 11773926, -318379767, -3791056, -318379767, -3791056, -318379767, -3791056, -318379767, -3791056, 675788404, 8046842, 675788404, 8046842, 675788404, 8046842, 675788404, 8046842, 719075991, 8562282, 719075991, 8562282, 719075991, 8562282, 719075991, 8562282, -410606666, -4889233, -410606666, -4889233, -410606666, -4889233, -410606666, -4889233, -39398809, -469135, -39398809, -469135, -39398809, -469135, -39398809, -469135, -323375678, -3850544, -323375678, -3850544, -323375678, -3850544, -323375678, -3850544, -616711312, -7343391, -616711312, -7343391, -616711312, -7343391, -616711312, -7343391, 197741568, 2354576, 197741568, 2354576, 197741568, 2354576, 197741568, 2354576, 775336082, 9232190, 775336082, 9232190, 775336082, 9232190, 775336082, 9232190, -135399935, -1612253, -135399935, -1612253, -135399935, -1612253, -135399935, -1612253, 865050664, 10300452, 865050664, 10300452, 865050664, 10300452, 865050664, 10300452, -1004611982, -11962256, -1004611982, -11962256, -1004611982, -11962256, -1004611982, -11962256, -621203079, -7396876, -621203079, -7396876, -621203079, -7396876, -621203079, -7396876, 135583351, 1614437, 135583351, 1614437, 135583351, 1614437, 135583351, 1614437, 530210041, 6313391, 530210041, 6313391, 530210041, 6313391, 530210041, 6313391, -695736773, -8284374, -695736773, -8284374, -695736773, -8284374, -695736773, -8284374, 408717831, 4866742, 408717831, 4866742, 408717831, 4866742, 408717831, 4866742
+    };
+
+#endif
diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/NTT_params.h b/src/kem/saber/pqclean_lightsaber_aarch64/NTT_params.h
new file mode 100644
index 0000000000..25624db6c9
--- /dev/null
+++ b/src/kem/saber/pqclean_lightsaber_aarch64/NTT_params.h
@@ -0,0 +1,32 @@
+#ifndef NTT_PARAMS_H
+#define NTT_PARAMS_H
+
+#define ARRAY_N 256
+
+#define NTT_N 64
+#define LOGNTT_N 6
+
+// Q1
+#define Q1 25570817
+// omegaQ1 = 3^( (Q1 - 1) / (NTT_N << 1) ) mod Q1
+#define omegaQ1 21614269
+// invomegaQ1 = omegaQ1^{-1} mod Q1
+#define invomegaQ1 8215090
+// R = 2^32 below
+// RmodQ1 = 2^32 mod^{+-} Q1
+#define RmodQ1 (-929960)
+// Q1prime = Q1^{-1} mod^{+-} 2^32
+#define Q1prime (-155332095)
+// invNQ1 = NTT_N^{-1} mod Q1
+#define invNQ1 25171273
+// R2modQ1 = 2^32 mod^{+-} Q1
+#define R2modQ1 (-929960)
+// Q1prime2 = -Q1^{-1} mod^{+-} 2^32
+#define Q1prime2 155332095
+
+#endif
+
+
+
+
+
diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/SABER_indcpa.c b/src/kem/saber/pqclean_lightsaber_aarch64/SABER_indcpa.c
new file mode 100644
index 0000000000..f4df4832de
--- /dev/null
+++ b/src/kem/saber/pqclean_lightsaber_aarch64/SABER_indcpa.c
@@ -0,0 +1,196 @@
+#include "NTT.h"
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "cbd.h"
+#include "fips202.h"
+#include "fips202x2.h"
+#include "pack_unpack.h"
+#include "randombytes.h"
+#include <stdint.h>
+#include <string.h>
+
+#define h1 (1 << (SABER_EQ - SABER_EP - 1))
+#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
+
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_round(uint16_t des[SABER_N], uint32_t src[SABER_N]);
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_enc_add_msg(uint16_t cipher[SABER_N], uint32_t src[SABER_N], uint16_t msg[SABER_N], int const_h1);
+extern void PQCLEAN_LIGHTSABER_AARCH64_asm_dec_get_msg(uint16_t msg[SABER_N], uint32_t src[SABER_N], uint16_t cipher[SABER_N], int const_h2);
+
+void
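+/* [Editor's note] Key generation below: expand seed_A with SHAKE128 into the
+   matrix A (packed 13-bit coefficients, unpacked by asm_13_to_32; note the
+   swapped indices, so A is stored transposed here), sample the secret s from
+   a centered binomial distribution (cbd), compute A^T s in the NTT domain
+   over Q1, round the result from 13 to 10 bits with asm_round, and pack
+   pk = (b, seed_A) and sk = s. */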
indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + + uint32_t A_NTT[SABER_L][SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N] = {0}; + + uint8_t seed_A[SABER_SEEDBYTES]; + uint8_t seed_s[SABER_NOISE_SEEDBYTES]; + + uint8_t shake_A_buf[SABER_L * SABER_L * SABER_POLYBYTES]; + uint8_t shake_s_buf[SABER_L * SABER_POLYCOINBYTES]; + + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(seed_s, SABER_NOISE_SEEDBYTES); + + shake128(shake_A_buf, sizeof(shake_A_buf), seed_A, SABER_SEEDBYTES); + shake128(shake_s_buf, sizeof(shake_s_buf), seed_s, SABER_NOISE_SEEDBYTES); + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32(&(A_NTT[j][i][0]), shake_A_buf + (i * SABER_L + j) * SABER_POLYBYTES); + } + } + + for (int i = 0; i < SABER_L; i++) { + cbd(s[i], shake_s_buf + i * SABER_POLYCOINBYTES); + PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32(&(s_NTT[i][0]), &(s[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + NTT(&(A_NTT[i][j][0])); + } + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul(&(A_NTT[i][0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + } + + for (int i = 0; i < SABER_L; i++) { + iNTT(&(A_NTT[i][0][0])); + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AARCH64_asm_round(b[i], A_NTT[i][0]); + } + + POLVECq2BS(sk, s); + POLVECp2BS(pk, b); + memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); +} + +void indcpa_kem_enc(const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + + uint32_t A_NTT[SABER_L][SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint32_t b_NTT[SABER_L][SABER_N]; + + uint16_t sp[SABER_L][SABER_N]; + uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t vp[SABER_N] = {0}; + uint16_t mp[SABER_N]; + uint16_t b[SABER_L][SABER_N]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + + uint8_t shake_A_buf[SABER_L * SABER_L * SABER_POLYBYTES]; + uint8_t shake_s_buf[SABER_L * SABER_POLYCOINBYTES]; + + shake128(shake_A_buf, sizeof(shake_A_buf), seed_A, SABER_SEEDBYTES); + shake128(shake_s_buf, sizeof(shake_s_buf), seed_sp, SABER_NOISE_SEEDBYTES); + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32(&(A_NTT[i][j][0]), shake_A_buf + (i * SABER_L + j) * SABER_POLYBYTES); + } + } + + for (int i = 0; i < SABER_L; i++) { + cbd(sp[i], shake_s_buf + i * SABER_POLYCOINBYTES); + PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32(&(s_NTT[i][0]), &(sp[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + NTT(&(A_NTT[i][j][0])); + } + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul(&(A_NTT[i][0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + } + + for (int i = 0; i < SABER_L; i++) { + iNTT(&(A_NTT[i][0][0])); + } + + for (int i = 
0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AARCH64_asm_round(bp[i], A_NTT[i][0]);
+    }
+
+
+    BS2POLVECp(pk, b);
+    BS2POLmsg(m, mp);
+
+    for (int i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32(&(b_NTT[i][0]), &(b[i][0]));
+    }
+
+    for (int i = 0; i < SABER_L; i++) {
+        NTT(&(b_NTT[i][0]));
+    }
+
+    PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul(&(b_NTT[0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants);
+
+    iNTT(&(b_NTT[0][0]));
+
+    PQCLEAN_LIGHTSABER_AARCH64_asm_enc_add_msg(vp, b_NTT[0], mp, h1);
+
+    POLVECp2BS(ciphertext, bp);
+    POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp);
+
+
+}
+
+void indcpa_kem_dec(const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC], uint8_t m[SABER_KEYBYTES]) {
+
+    uint32_t b_NTT[SABER_L][SABER_N];
+    uint32_t s_NTT[SABER_L][SABER_N];
+    uint32_t s_NTT_asymmetric[SABER_L][SABER_N];
+
+    uint16_t v[SABER_N] = {0};
+    uint16_t cm[SABER_N];
+
+    BS2POLT(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, cm);
+
+    for (int i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32(&(s_NTT[i][0]), sk + i * SABER_POLYBYTES);
+    }
+
+    for (int i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AARCH64_asm_10_to_32(&(b_NTT[i][0]), ciphertext + i * (SABER_EP * SABER_N / 8));
+    }
+
+    for (int i = 0; i < SABER_L; i++) {
+        NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0]));
+    }
+
+    for (int i = 0; i < SABER_L; i++) {
+        NTT(&(b_NTT[i][0]));
+    }
+
+    PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul(&(b_NTT[0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants);
+
+    iNTT(&(b_NTT[0][0]));
+
+    PQCLEAN_LIGHTSABER_AARCH64_asm_dec_get_msg(v, b_NTT[0], cm, h2);
+
+    POLmsg2BS(m, v);
+}
+
+
+
+
+
diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/SABER_indcpa.h b/src/kem/saber/pqclean_lightsaber_aarch64/SABER_indcpa.h
new file mode 100644
index 0000000000..0b74c2fca0
--- /dev/null
+++ b/src/kem/saber/pqclean_lightsaber_aarch64/SABER_indcpa.h
@@ -0,0 +1,14 @@
+#ifndef INDCPA_H
+#define INDCPA_H
+
+#include "SABER_params.h"
+#include <stdint.h>
+
+#define indcpa_kem_keypair SABER_NAMESPACE(indcpa_kem_keypair)
+void indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
+#define indcpa_kem_enc SABER_NAMESPACE(indcpa_kem_enc)
+void indcpa_kem_enc(const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
+#define indcpa_kem_dec SABER_NAMESPACE(indcpa_kem_dec)
+void indcpa_kem_dec(const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC], uint8_t m[SABER_KEYBYTES]);
+
+#endif
diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/SABER_params.h b/src/kem/saber/pqclean_lightsaber_aarch64/SABER_params.h
new file mode 100644
index 0000000000..96a08121de
--- /dev/null
+++ b/src/kem/saber/pqclean_lightsaber_aarch64/SABER_params.h
@@ -0,0 +1,48 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+/*=============================================================================
+This file has been adapted from the implementation (available, in the public
+domain, at https://github.com/KULeuven-COSIC/SABER)
+of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM"
+by: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren;
+Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso.
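+
+[Editor's note] With SABER_L = 2 (LightSaber) the macros below work out to:
+SABER_POLYBYTES = 13*256/8 = 416, SABER_POLYVECBYTES = 832,
+SABER_POLYVECCOMPRESSEDBYTES = 2*10*256/8 = 640,
+SABER_INDCPA_PUBLICKEYBYTES = 640 + 32 = 672,
+SABER_SECRETKEYBYTES = 832 + 672 + 32 + 32 = 1568, and
+SABER_BYTES_CCA_DEC = 640 + 3*256/8 = 736,
+matching LightSaber's published public-key, secret-key and ciphertext sizes.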
+=============================================================================*/ + +#define SABER_NAMESPACE(s) PQCLEAN_LIGHTSABER_AARCH64_##s +#define SABER_L 2 + +/* Don't change anything below this line */ +#define SABER_MU 10 +#define SABER_ET 3 + +#define SABER_EQ 13 +#define SABER_EP 10 +#define SABER_N 256 + +#define SABER_Q 8192 //2^13 +#define SABER_P 1024 + +#define SABER_SEEDBYTES 32 +#define SABER_NOISE_SEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 + +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) + +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) + +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) + +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) + +#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) +#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) + +#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) + +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) + +#endif diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/__asm_NTT.S b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_NTT.S new file mode 100644 index 0000000000..eeef00da57 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_NTT.S @@ -0,0 +1,309 @@ + +#include "macros.inc" + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_top +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_top +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_top, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_top: +_PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_top: + + push_all + Q .req w20 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + src4 .req x4 + src5 .req x5 + src6 .req x6 + src7 .req x7 + src8 .req x8 + src9 .req x9 + src10 .req x10 + src11 .req x11 + src12 .req x12 + src13 .req x13 + src14 .req x14 + src15 .req x15 + table .req x28 + counter .req x19 + + ldr Q, [x2] + + mov table, x1 + + add src1, src0, #64 + add src2, src0, #128 + + add src3, src0, #192 + add src4, src0, #256 + + add src5, src0, #320 + add src6, src0, #384 + + add src7, src0, #448 + add src8, src0, #512 + + add src9, src0, #576 + add src10, src0, #640 + + add src11, src0, #704 + add src12, src0, #768 + + add src13, src0, #832 + add src14, src0, #896 + + add src15, src0, #960 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 + + mov v20.S[0], Q + + ld1 { v0.4S}, [ src0] + ld1 { v2.4S}, [ src2] + ld1 { v4.4S}, [ src4] + ld1 { v6.4S}, [ src6] + ld1 { v8.4S}, [ src8] + ld1 {v10.4S}, [src10] + ld1 {v12.4S}, [src12] + ld1 {v14.4S}, [src14] + + ld1 { v1.4S}, [ src1] + ld1 { v3.4S}, [ src3] + ld1 { v5.4S}, [ src5] + ld1 { v7.4S}, [ src7] + ld1 { v9.4S}, [ src9] + ld1 {v11.4S}, [src11] + ld1 {v13.4S}, [src13] + ld1 {v15.4S}, [src15] + + qq_butterfly_top v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, 
v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_bot v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + + mov counter, #3 + _ntt_top_loop: + + st1 { v0.4S}, [ src0], #16 + ld1 { v0.4S}, [ src0] + st1 { v2.4S}, [ src2], #16 + ld1 { v2.4S}, [ src2] + st1 { v4.4S}, [ src4], #16 + ld1 { v4.4S}, [ src4] + st1 { v6.4S}, [ src6], #16 + ld1 { v6.4S}, [ src6] + st1 { v8.4S}, [ src8], #16 + ld1 { v8.4S}, [ src8] + st1 {v10.4S}, [src10], #16 + ld1 {v10.4S}, [src10] + st1 {v12.4S}, [src12], #16 + ld1 {v12.4S}, [src12] + st1 {v14.4S}, [src14], #16 + ld1 {v14.4S}, [src14] + + st1 { v1.4S}, [ src1], #16 + ld1 { v1.4S}, [ src1] + st1 { v3.4S}, [ src3], #16 + ld1 { v3.4S}, [ src3] + st1 { v5.4S}, [ src5], #16 + ld1 { v5.4S}, [ src5] + st1 { v7.4S}, [ src7], #16 + ld1 { v7.4S}, [ src7] + st1 { v9.4S}, [ src9], #16 + ld1 { v9.4S}, [ src9] + st1 {v11.4S}, [src11], #16 + ld1 {v11.4S}, [src11] + st1 {v13.4S}, [src13], #16 + ld1 {v13.4S}, [src13] + st1 {v15.4S}, [src15], #16 + ld1 {v15.4S}, [src15] + + qq_butterfly_top v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_bot v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + + sub counter, counter, #1 + cbnz counter, _ntt_top_loop + + st1 { v0.4S}, [ src0], #16 + st1 { v2.4S}, [ src2], #16 + st1 { v4.4S}, [ src4], #16 + st1 { v6.4S}, [ src6], #16 + st1 { v8.4S}, [ src8], #16 + st1 {v10.4S}, [src10], #16 + st1 {v12.4S}, [src12], #16 + st1 {v14.4S}, [src14], #16 + + st1 { v1.4S}, [ src1], #16 + st1 { v3.4S}, [ src3], #16 + st1 { v5.4S}, [ src5], #16 + st1 { v7.4S}, [ src7], #16 + st1 { v9.4S}, [ src9], #16 + st1 {v11.4S}, 
[src11], #16 + st1 {v13.4S}, [src13], #16 + st1 {v15.4S}, [src15], #16 + + .unreq Q + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq src4 + .unreq src5 + .unreq src6 + .unreq src7 + .unreq src8 + .unreq src9 + .unreq src10 + .unreq src11 + .unreq src12 + .unreq src13 + .unreq src14 + .unreq src15 + .unreq table + .unreq counter + pop_all + + br lr + + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_bot +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_bot +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_bot, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_bot: +_PQCLEAN_LIGHTSABER_AARCH64_asm_ntt_SIMD_bot: + + push_all + Q .req w20 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + table0 .req x27 + table1 .req x28 + counter .req x19 + + ldr Q, [x2] + + add table0, x1, #64 + add table1, x1, #320 + + add src1, src0, #0 + add src2, src0, #512 + add src3, src0, #512 + + ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 + ld1 { v8.4S, v9.4S, v10.4S, v11.4S}, [src2], #64 + ld1 { v4.4S, v5.4S, v6.4S, v7.4S}, [src0], #64 + ld1 {v12.4S, v13.4S, v14.4S, v15.4S}, [src2], #64 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table0], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + + mov v20.S[0], Q + + qq_butterfly_top v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3 + qq_butterfly_mixed v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v20, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3 + qq_butterfly_mixed v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + mov counter, #3 + _ntt_bot_loop: + + st1 { v0.4S}, [src1], #16 + ld1 { v0.4S}, [src0], #16 + st1 { v1.4S}, [src1], #16 + ld1 { v1.4S}, [src0], #16 + st1 { v2.4S}, [src1], #16 + ld1 { v2.4S}, [src0], #16 + st1 { v3.4S}, [src1], #16 + ld1 { v3.4S}, [src0], #16 + st1 { v4.4S}, [src1], #16 + ld1 { v4.4S}, [src0], #16 + st1 { v5.4S}, [src1], #16 + ld1 { v5.4S}, [src0], #16 + st1 { v6.4S}, [src1], #16 + ld1 { v6.4S}, [src0], #16 + st1 { v7.4S}, [src1], #16 + ld1 { v7.4S}, [src0], #16 + st1 { v8.4S}, [src3], #16 + ld1 { v8.4S}, [src2], #16 + st1 { v9.4S}, [src3], #16 + ld1 { v9.4S}, [src2], #16 + st1 {v10.4S}, [src3], #16 + ld1 {v10.4S}, [src2], #16 + st1 {v11.4S}, [src3], #16 + ld1 {v11.4S}, [src2], #16 + st1 {v12.4S}, [src3], #16 + ld1 {v12.4S}, [src2], #16 + st1 {v13.4S}, [src3], #16 + ld1 {v13.4S}, [src2], #16 + st1 {v14.4S}, [src3], #16 + ld1 {v14.4S}, [src2], #16 + st1 {v15.4S}, [src3], #16 + ld1 
{v15.4S}, [src2], #16 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table0], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + + mov v20.S[0], Q + + qq_butterfly_top v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3 + qq_butterfly_mixed v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v20, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3 + qq_butterfly_mixed v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + sub counter, counter, #1 + cbnz counter, _ntt_bot_loop + + st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src1], #64 + st1 { v8.4S, v9.4S, v10.4S, v11.4S}, [src3], #64 + st1 { v4.4S, v5.4S, v6.4S, v7.4S}, [src1], #64 + st1 {v12.4S, v13.4S, v14.4S, v15.4S}, [src3], #64 + + .unreq Q + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq table0 + .unreq table1 + pop_all + + br lr + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/__asm_iNTT.S b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_iNTT.S new file mode 100644 index 0000000000..f33a186419 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_iNTT.S @@ -0,0 +1,472 @@ + +#include "macros.inc" + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_top +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_top +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_top, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_top: +_PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_top: + + push_all + Q .req w20 + src0 .req x0 + des0 .req x1 + src1 .req x2 + des1 .req x3 + table .req x28 + counter .req x19 + + ldr Q, [x2] + + mov table, x1 + + add des0, src0, #0 + add src1, src0, #512 + add des1, src0, #512 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 + + mov v20.S[0], Q + + ld1 { v0.4S}, [src0], #16 + ld1 { v1.4S}, [src0], #16 + ld1 { v2.4S}, [src0], #16 + ld1 { v3.4S}, [src0], #16 + ld1 { v4.4S}, [src0], #16 + ld1 { v5.4S}, [src0], #16 + ld1 { v6.4S}, [src0], #16 + ld1 { v7.4S}, [src0], #16 + + ld1 { v8.4S}, [src1], #16 + ld1 { v9.4S}, [src1], #16 + ld1 {v10.4S}, [src1], #16 + ld1 {v11.4S}, [src1], #16 + ld1 {v12.4S}, [src1], #16 + ld1 {v13.4S}, [src1], #16 + ld1 {v14.4S}, [src1], #16 + ld1 {v15.4S}, [src1], #16 + + qq_add_sub v16, v17, v18, v19, v1, v3, v5, v7, v0, v2, v4, v6, v1, v3, v5, v7 + qq_add_sub v28, v29, v30, v31, v9, v11, v13, v15, v8, v10, v12, v14, v9, v11, v13, v15 + + qq_add_sub v0, v4, v8, v12, v2, v6, v10, v14, v16, v18, v28, v30, v17, v19, v29, v31 + + 
dq_butterfly_top v1, v5, v3, v7, v16, v17, v20, v21, 2, 3, v21, 2, 3 + + dq_butterfly_mixed v1, v5, v3, v7, v16, v17, v9, v13, v11, v15, v18, v19, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + dq_butterfly_mixed v9, v13, v11, v15, v18, v19, v0, v1, v4, v5, v28, v29, v20, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v0, v1, v4, v5, v28, v29, v2, v3, v6, v7, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v30, v31, v8, v9, v12, v13, v16, v17, v20, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v8, v9, v12, v13, v16, v17, v10, v11, v14, v15, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_bot v10, v11, v14, v15, v18, v19, v20, v23, 0, 1, v23, 2, 3 + + mov counter, #3 + _intt_top_loop: + + st1 { v0.4S}, [des0], #16 + ld1 { v0.4S}, [src0], #16 + st1 { v1.4S}, [des0], #16 + ld1 { v1.4S}, [src0], #16 + st1 { v2.4S}, [des0], #16 + ld1 { v2.4S}, [src0], #16 + st1 { v3.4S}, [des0], #16 + ld1 { v3.4S}, [src0], #16 + st1 { v4.4S}, [des0], #16 + ld1 { v4.4S}, [src0], #16 + st1 { v5.4S}, [des0], #16 + ld1 { v5.4S}, [src0], #16 + st1 { v6.4S}, [des0], #16 + ld1 { v6.4S}, [src0], #16 + st1 { v7.4S}, [des0], #16 + ld1 { v7.4S}, [src0], #16 + + st1 { v8.4S}, [des1], #16 + ld1 { v8.4S}, [src1], #16 + st1 { v9.4S}, [des1], #16 + ld1 { v9.4S}, [src1], #16 + st1 {v10.4S}, [des1], #16 + ld1 {v10.4S}, [src1], #16 + st1 {v11.4S}, [des1], #16 + ld1 {v11.4S}, [src1], #16 + st1 {v12.4S}, [des1], #16 + ld1 {v12.4S}, [src1], #16 + st1 {v13.4S}, [des1], #16 + ld1 {v13.4S}, [src1], #16 + st1 {v14.4S}, [des1], #16 + ld1 {v14.4S}, [src1], #16 + st1 {v15.4S}, [des1], #16 + ld1 {v15.4S}, [src1], #16 + + qq_add_sub v16, v17, v18, v19, v1, v3, v5, v7, v0, v2, v4, v6, v1, v3, v5, v7 + qq_add_sub v28, v29, v30, v31, v9, v11, v13, v15, v8, v10, v12, v14, v9, v11, v13, v15 + + qq_add_sub v0, v4, v8, v12, v2, v6, v10, v14, v16, v18, v28, v30, v17, v19, v29, v31 + + dq_butterfly_top v1, v5, v3, v7, v16, v17, v20, v21, 2, 3, v21, 2, 3 + + dq_butterfly_mixed v1, v5, v3, v7, v16, v17, v9, v13, v11, v15, v18, v19, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + dq_butterfly_mixed v9, v13, v11, v15, v18, v19, v0, v1, v4, v5, v28, v29, v20, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v0, v1, v4, v5, v28, v29, v2, v3, v6, v7, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v30, v31, v8, v9, v12, v13, v16, v17, v20, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v8, v9, v12, v13, v16, v17, v10, v11, v14, v15, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_bot v10, v11, v14, v15, v18, v19, v20, v23, 0, 1, v23, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + + st1 { v0.4S}, [des0], #16 + st1 { v1.4S}, [des0], #16 + st1 { v2.4S}, [des0], #16 + st1 { v3.4S}, [des0], #16 + st1 { v4.4S}, [des0], #16 + st1 { v5.4S}, [des0], #16 + st1 { v6.4S}, [des0], #16 + st1 { v7.4S}, [des0], #16 + + st1 { v8.4S}, [des1], #16 + st1 { v9.4S}, [des1], #16 + st1 {v10.4S}, [des1], #16 + st1 {v11.4S}, [des1], #16 + st1 {v12.4S}, [des1], #16 + st1 {v13.4S}, [des1], #16 + st1 {v14.4S}, [des1], #16 + st1 {v15.4S}, [des1], #16 + + .unreq Q + .unreq src0 + .unreq des0 + .unreq src1 + .unreq des1 + .unreq table + .unreq counter + pop_all + + br lr + + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_bot +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_bot +#ifndef __clang__ +.type 
PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_bot, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_bot: +_PQCLEAN_LIGHTSABER_AARCH64_asm_intt_SIMD_bot: + + push_all + Q .req w20 + Qhalf .req w21 + nQhalf .req w22 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + src4 .req x4 + src5 .req x5 + src6 .req x6 + src7 .req x7 + table .req x28 + twistT0 .req x8 + twistT1 .req x9 + twistT2 .req x10 + twistT3 .req x11 + twistT4 .req x12 + twistT5 .req x13 + twistT6 .req x14 + twistT7 .req x15 + counter .req x19 + + add twistT0, x3, #256*0 + add twistT1, x3, #256*1 + add twistT2, x3, #256*2 + add twistT3, x3, #256*3 + add twistT4, x3, #256*4 + add twistT5, x3, #256*5 + add twistT6, x3, #256*6 + add twistT7, x3, #256*7 + + ldr Q, [x2] + lsr Qhalf, Q, #1 + neg nQhalf, Qhalf + + add table, x1, #64 + + add src1, src0, #128 + add src2, src0, #256 + add src3, src0, #384 + add src4, src0, #512 + add src5, src0, #640 + add src6, src0, #768 + add src7, src0, #896 + + ld1 { v0.4S}, [ src0] + ld1 { v1.4S}, [ src1] + ld1 { v2.4S}, [ src2] + ld1 { v3.4S}, [ src3] + ld1 { v4.4S}, [ src4] + ld1 { v5.4S}, [ src5] + ld1 { v6.4S}, [ src6] + ld1 { v7.4S}, [ src7] + + ld1 {v20.4S}, [table], #16 + ld1 {v21.4S}, [table], #16 + ld1 {v22.4S}, [table], #16 + ld1 {v23.4S}, [table], #16 + + dup v24.4S, Q + dup v25.4S, Qhalf + dup v26.4S, nQhalf + + dq_butterfly_top v4, v6, v5, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v4, v6, v5, v7, v18, v19, v0, v2, v1, v3, v16, v17, v24, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v0, v2, v1, v3, v16, v17, v4, v5, v6, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v4, v5, v6, v7, v18, v19, v0, v1, v2, v3, v16, v17, v24, v21, 0, 1, v21, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v0, v1, v2, v3, v16, v17, v2, v3, v6, v7, v18, v19, v24, v21, 0, 1, v21, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v18, v19, v0, v1, v4, v5, v16, v17, v24, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_bot v0, v1, v4, v5, v16, v17, v24, v22, 0, 1, v22, 2, 3 + + ld2 { v8.4S, v9.4S}, [twistT0], #32 + ld2 {v10.4S, v11.4S}, [twistT1], #32 + ld2 {v12.4S, v13.4S}, [twistT2], #32 + ld2 {v14.4S, v15.4S}, [twistT3], #32 + + sqrdmulh v16.4S, v0.4S, v8.4S + sqrdmulh v17.4S, v1.4S, v10.4S + sqrdmulh v18.4S, v2.4S, v12.4S + sqrdmulh v19.4S, v3.4S, v14.4S + + mul v0.4S, v0.4S, v9.4S + mul v1.4S, v1.4S, v11.4S + mul v2.4S, v2.4S, v13.4S + mul v3.4S, v3.4S, v15.4S + + mls v0.4S, v16.4S, v24.4S + ld2 { v8.4S, v9.4S}, [twistT4], #32 + mls v1.4S, v17.4S, v24.4S + ld2 {v10.4S, v11.4S}, [twistT5], #32 + mls v2.4S, v18.4S, v24.4S + ld2 {v12.4S, v13.4S}, [twistT6], #32 + mls v3.4S, v19.4S, v24.4S + ld2 {v14.4S, v15.4S}, [twistT7], #32 + + cmge v18.4S, v26.4S, v0.4S + sqrdmulh v20.4S, v4.4S, v8.4S + cmge v19.4S, v26.4S, v1.4S + sqrdmulh v21.4S, v5.4S, v10.4S + cmgt v16.4S, v0.4S, v25.4S + sqrdmulh v22.4S, v6.4S, v12.4S + cmgt v17.4S, v1.4S, v25.4S + sqrdmulh v23.4S, v7.4S, v14.4S + + sub v16.4S, v16.4S, v18.4S + mul v4.4S, v4.4S, v9.4S + sub v17.4S, v17.4S, v19.4S + mul v5.4S, v5.4S, v11.4S + + mla v0.4S, v16.4S, v24.4S + mul v6.4S, v6.4S, v13.4S + mla v1.4S, v17.4S, v24.4S + mul v7.4S, v7.4S, v15.4S + + cmge v18.4S, v26.4S, v2.4S + mls v4.4S, v20.4S, v24.4S + cmge v19.4S, v26.4S, v3.4S + mls v5.4S, v21.4S, v24.4S + cmgt v16.4S, v2.4S, v25.4S + mls v6.4S, v22.4S, v24.4S + cmgt v17.4S, v3.4S, v25.4S + mls v7.4S, v23.4S, v24.4S + + sub v16.4S, v16.4S, v18.4S + cmge v22.4S, v26.4S, v4.4S + sub v17.4S, v17.4S, 
v19.4S + cmge v23.4S, v26.4S, v5.4S + + mla v2.4S, v16.4S, v24.4S + cmgt v20.4S, v4.4S, v25.4S + mla v3.4S, v17.4S, v24.4S + cmgt v21.4S, v5.4S, v25.4S + + st1 { v0.4S}, [ src0], #16 + sub v20.4S, v20.4S, v22.4S + st1 { v1.4S}, [ src1], #16 + sub v21.4S, v21.4S, v23.4S + st1 { v2.4S}, [ src2], #16 + mla v4.4S, v20.4S, v24.4S + st1 { v3.4S}, [ src3], #16 + mla v5.4S, v21.4S, v24.4S + + mov counter, #7 + _intt_bot_loop: + + cmge v22.4S, v26.4S, v6.4S + ld1 { v0.4S}, [ src0] + cmge v23.4S, v26.4S, v7.4S + ld1 { v1.4S}, [ src1] + cmgt v20.4S, v6.4S, v25.4S + ld1 { v2.4S}, [ src2] + cmgt v21.4S, v7.4S, v25.4S + ld1 { v3.4S}, [ src3] + + sub v20.4S, v20.4S, v22.4S + sub v21.4S, v21.4S, v23.4S + + mla v6.4S, v20.4S, v24.4S + mla v7.4S, v21.4S, v24.4S + + st1 { v4.4S}, [ src4], #16 + ld1 { v4.4S}, [ src4] + st1 { v5.4S}, [ src5], #16 + ld1 { v5.4S}, [ src5] + st1 { v6.4S}, [ src6], #16 + ld1 { v6.4S}, [ src6] + st1 { v7.4S}, [ src7], #16 + ld1 { v7.4S}, [ src7] + + ld1 {v20.4S}, [table], #16 + ld1 {v21.4S}, [table], #16 + ld1 {v22.4S}, [table], #16 + ld1 {v23.4S}, [table], #16 + + dup v24.4S, Q + dup v25.4S, Qhalf + dup v26.4S, nQhalf + + dq_butterfly_top v4, v6, v5, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v4, v6, v5, v7, v18, v19, v0, v2, v1, v3, v16, v17, v24, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v0, v2, v1, v3, v16, v17, v4, v5, v6, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v4, v5, v6, v7, v18, v19, v0, v1, v2, v3, v16, v17, v24, v21, 0, 1, v21, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v0, v1, v2, v3, v16, v17, v2, v3, v6, v7, v18, v19, v24, v21, 0, 1, v21, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v18, v19, v0, v1, v4, v5, v16, v17, v24, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_bot v0, v1, v4, v5, v16, v17, v24, v22, 0, 1, v22, 2, 3 + + ld2 { v8.4S, v9.4S}, [twistT0], #32 + ld2 {v10.4S, v11.4S}, [twistT1], #32 + ld2 {v12.4S, v13.4S}, [twistT2], #32 + ld2 {v14.4S, v15.4S}, [twistT3], #32 + + sqrdmulh v16.4S, v0.4S, v8.4S + sqrdmulh v17.4S, v1.4S, v10.4S + sqrdmulh v18.4S, v2.4S, v12.4S + sqrdmulh v19.4S, v3.4S, v14.4S + + mul v0.4S, v0.4S, v9.4S + mul v1.4S, v1.4S, v11.4S + mul v2.4S, v2.4S, v13.4S + mul v3.4S, v3.4S, v15.4S + + mls v0.4S, v16.4S, v24.4S + ld2 { v8.4S, v9.4S}, [twistT4], #32 + mls v1.4S, v17.4S, v24.4S + ld2 {v10.4S, v11.4S}, [twistT5], #32 + mls v2.4S, v18.4S, v24.4S + ld2 {v12.4S, v13.4S}, [twistT6], #32 + mls v3.4S, v19.4S, v24.4S + ld2 {v14.4S, v15.4S}, [twistT7], #32 + + cmge v18.4S, v26.4S, v0.4S + sqrdmulh v20.4S, v4.4S, v8.4S + cmge v19.4S, v26.4S, v1.4S + sqrdmulh v21.4S, v5.4S, v10.4S + cmgt v16.4S, v0.4S, v25.4S + sqrdmulh v22.4S, v6.4S, v12.4S + cmgt v17.4S, v1.4S, v25.4S + sqrdmulh v23.4S, v7.4S, v14.4S + + sub v16.4S, v16.4S, v18.4S + mul v4.4S, v4.4S, v9.4S + sub v17.4S, v17.4S, v19.4S + mul v5.4S, v5.4S, v11.4S + + mla v0.4S, v16.4S, v24.4S + mul v6.4S, v6.4S, v13.4S + mla v1.4S, v17.4S, v24.4S + mul v7.4S, v7.4S, v15.4S + + cmge v18.4S, v26.4S, v2.4S + mls v4.4S, v20.4S, v24.4S + cmge v19.4S, v26.4S, v3.4S + mls v5.4S, v21.4S, v24.4S + cmgt v16.4S, v2.4S, v25.4S + mls v6.4S, v22.4S, v24.4S + cmgt v17.4S, v3.4S, v25.4S + mls v7.4S, v23.4S, v24.4S + + sub v16.4S, v16.4S, v18.4S + cmge v22.4S, v26.4S, v4.4S + sub v17.4S, v17.4S, v19.4S + cmge v23.4S, v26.4S, v5.4S + + mla v2.4S, v16.4S, v24.4S + cmgt v20.4S, v4.4S, v25.4S + mla v3.4S, v17.4S, v24.4S + cmgt v21.4S, v5.4S, v25.4S + + st1 { v0.4S}, [ src0], #16 + sub v20.4S, v20.4S, 
v22.4S + st1 { v1.4S}, [ src1], #16 + sub v21.4S, v21.4S, v23.4S + st1 { v2.4S}, [ src2], #16 + mla v4.4S, v20.4S, v24.4S + st1 { v3.4S}, [ src3], #16 + mla v5.4S, v21.4S, v24.4S + + sub counter, counter, #1 + cbnz counter, _intt_bot_loop + + cmge v22.4S, v26.4S, v6.4S + cmge v23.4S, v26.4S, v7.4S + cmgt v20.4S, v6.4S, v25.4S + cmgt v21.4S, v7.4S, v25.4S + + sub v20.4S, v20.4S, v22.4S + sub v21.4S, v21.4S, v23.4S + + mla v6.4S, v20.4S, v24.4S + mla v7.4S, v21.4S, v24.4S + + st1 { v4.4S}, [ src4], #16 + st1 { v5.4S}, [ src5], #16 + st1 { v6.4S}, [ src6], #16 + st1 { v7.4S}, [ src7], #16 + + .unreq Q + .unreq Qhalf + .unreq nQhalf + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq src4 + .unreq src5 + .unreq src6 + .unreq src7 + .unreq table + .unreq twistT0 + .unreq twistT1 + .unreq twistT2 + .unreq twistT3 + .unreq twistT4 + .unreq twistT5 + .unreq twistT6 + .unreq twistT7 + .unreq counter + pop_all + + br lr + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/__asm_mul.S b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_mul.S new file mode 100644 index 0000000000..9be57de7c1 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_mul.S @@ -0,0 +1,255 @@ + +#include "macros.inc" +#include "SABER_params.h" + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul: +_PQCLEAN_LIGHTSABER_AARCH64_asm_asymmetric_mul: + + push_all + + ldr w28, [x3, #0] + ldr w27, [x3, #4] + + dup v28.4S, w28 + dup v29.4S, w27 + + add x11, x0, #0 + + add x4, x0, #1024 + add x5, x1, #1024 + add x6, x2, #1024 + +.if SABER_L > 2 + add x8, x0, #2048 + add x9, x1, #2048 + add x10, x2, #2048 +.endif + +.if SABER_L > 3 + add x12, x0, #3072 + add x13, x1, #3072 + add x14, x2, #3072 +.endif + + mov x16, #16 + _asymmetric_loop: + + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x0], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x1], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [ x2], #64 + + _4x4_asymmetric smull, smull2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 + + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x4], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x5], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [ x6], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 + +.if SABER_L > 2 + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x8], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x9], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [x10], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, 
v19, v23 +.endif + +.if SABER_L > 3 + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [x12], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [x13], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [x14], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 +.endif + + qq_montgomery v24, v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v0, v1, v2, v3, v29, v28 + + st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x11], #64 + + sub x16, x16, #1 + cbnz x16, _asymmetric_loop + + pop_all + + br lr + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_point_mul_extended +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_point_mul_extended +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_point_mul_extended, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_point_mul_extended: +_PQCLEAN_LIGHTSABER_AARCH64_asm_point_mul_extended: + + push_all + + ldr w20, [x3] + + ld1 { v0.4S}, [x1], #16 + ld1 { v1.4S}, [x1], #16 + ld1 { v2.4S}, [x1], #16 + ld1 { v3.4S}, [x1], #16 + + ld2 { v4.4S, v5.4S}, [x2], #32 + ld2 { v6.4S, v7.4S}, [x2], #32 + ld2 { v8.4S, v9.4S}, [x2], #32 + ld2 {v10.4S, v11.4S}, [x2], #32 + + sqrdmulh v12.4S, v0.4S, v4.4S + sqrdmulh v13.4S, v1.4S, v6.4S + sqrdmulh v14.4S, v2.4S, v8.4S + sqrdmulh v15.4S, v3.4S, v10.4S + + mov x16, #7 + _point_mul_loop: + + dup v4.4S, w20 + + mul v0.4S, v0.4S, v5.4S + ld1 {v16.4S}, [x1], #16 + mul v1.4S, v1.4S, v7.4S + ld1 {v17.4S}, [x1], #16 + mul v2.4S, v2.4S, v9.4S + ld1 {v18.4S}, [x1], #16 + mul v3.4S, v3.4S, v11.4S + ld1 {v19.4S}, [x1], #16 + + mls v0.4S, v12.4S, v4.4S + ld2 {v20.4S, v21.4S}, [x2], #32 + mls v1.4S, v13.4S, v4.4S + ld2 {v22.4S, v23.4S}, [x2], #32 + mls v2.4S, v14.4S, v4.4S + ld2 {v24.4S, v25.4S}, [x2], #32 + mls v3.4S, v15.4S, v4.4S + ld2 {v26.4S, v27.4S}, [x2], #32 + + st1 { v0.4S}, [x0], #16 + sqrdmulh v28.4S, v16.4S, v20.4S + st1 { v1.4S}, [x0], #16 + sqrdmulh v29.4S, v17.4S, v22.4S + st1 { v2.4S}, [x0], #16 + sqrdmulh v30.4S, v18.4S, v24.4S + st1 { v3.4S}, [x0], #16 + sqrdmulh v31.4S, v19.4S, v26.4S + + dup v20.4S, w20 + + mul v16.4S, v16.4S, v21.4S + ld1 { v0.4S}, [x1], #16 + mul v17.4S, v17.4S, v23.4S + ld1 { v1.4S}, [x1], #16 + mul v18.4S, v18.4S, v25.4S + ld1 { v2.4S}, [x1], #16 + mul v19.4S, v19.4S, v27.4S + ld1 { v3.4S}, [x1], #16 + + mls v16.4S, v28.4S, v20.4S + ld2 { v4.4S, v5.4S}, [x2], #32 + mls v17.4S, v29.4S, v20.4S + ld2 { v6.4S, v7.4S}, [x2], #32 + mls v18.4S, v30.4S, v20.4S + ld2 { v8.4S, v9.4S}, [x2], #32 + mls v19.4S, v31.4S, v20.4S + ld2 {v10.4S, v11.4S}, [x2], #32 + + st1 {v16.4S}, [x0], #16 + sqrdmulh v12.4S, v0.4S, v4.4S + st1 {v17.4S}, [x0], #16 + sqrdmulh v13.4S, v1.4S, v6.4S + st1 {v18.4S}, [x0], #16 + sqrdmulh v14.4S, v2.4S, v8.4S + st1 {v19.4S}, [x0], #16 + sqrdmulh v15.4S, v3.4S, v10.4S + + sub x16, x16, #1 + cbnz x16, _point_mul_loop + + dup v4.4S, w20 + + mul v0.4S, v0.4S, v5.4S + ld1 {v16.4S}, [x1], #16 + mul v1.4S, v1.4S, v7.4S + ld1 {v17.4S}, [x1], #16 + mul v2.4S, v2.4S, v9.4S + ld1 {v18.4S}, [x1], #16 + mul v3.4S, v3.4S, v11.4S + ld1 {v19.4S}, [x1], #16 + + mls v0.4S, v12.4S, v4.4S + ld2 {v20.4S, v21.4S}, [x2], #32 + mls v1.4S, v13.4S, v4.4S + ld2 {v22.4S, v23.4S}, [x2], #32 + mls v2.4S, v14.4S, v4.4S + ld2 {v24.4S, v25.4S}, [x2], #32 + mls v3.4S, v15.4S, v4.4S + ld2 {v26.4S, v27.4S}, [x2], #32 + + st1 { v0.4S}, 
[x0], #16 + sqrdmulh v28.4S, v16.4S, v20.4S + st1 { v1.4S}, [x0], #16 + sqrdmulh v29.4S, v17.4S, v22.4S + st1 { v2.4S}, [x0], #16 + sqrdmulh v30.4S, v18.4S, v24.4S + st1 { v3.4S}, [x0], #16 + sqrdmulh v31.4S, v19.4S, v26.4S + + dup v20.4S, w20 + + mul v16.4S, v16.4S, v21.4S + mul v17.4S, v17.4S, v23.4S + mul v18.4S, v18.4S, v25.4S + mul v19.4S, v19.4S, v27.4S + + mls v16.4S, v28.4S, v20.4S + mls v17.4S, v29.4S, v20.4S + mls v18.4S, v30.4S, v20.4S + mls v19.4S, v31.4S, v20.4S + + st1 {v16.4S}, [x0], #16 + st1 {v17.4S}, [x0], #16 + st1 {v18.4S}, [x0], #16 + st1 {v19.4S}, [x0], #16 + + pop_all + + br lr + + + + + + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/__asm_narrow.S b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_narrow.S new file mode 100644 index 0000000000..e44c53944c --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_narrow.S @@ -0,0 +1,247 @@ + +#include "SABER_params.h" + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_round +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_round +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_round, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_round: +_PQCLEAN_LIGHTSABER_AARCH64_asm_round: + + + .equ srv, (SABER_EQ-SABER_EP) + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + + srshr v0.8H, v0.8H, #srv + srshr v2.8H, v2.8H, #srv + srshr v4.8H, v4.8H, #srv + srshr v6.8H, v6.8H, #srv + + mov x7, #7 + _round_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + + srshr v0.8H, v0.8H, #srv + srshr v2.8H, v2.8H, #srv + srshr v4.8H, v4.8H, #srv + srshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _round_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_enc_add_msg +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_enc_add_msg +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_enc_add_msg, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_enc_add_msg: +_PQCLEAN_LIGHTSABER_AARCH64_asm_enc_add_msg: + + .equ srv, (SABER_EP-SABER_ET) + .equ slv, (SABER_EP-1) + + dup v30.8H, w3 + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + mov x7, #7 + _enc_add_msg_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, 
v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _enc_add_msg_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_dec_get_msg +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_dec_get_msg +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_dec_get_msg, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_dec_get_msg: +_PQCLEAN_LIGHTSABER_AARCH64_asm_dec_get_msg: + + .equ srv, (SABER_EP-1) + .equ slv, (SABER_EP-SABER_ET) + + dup v30.8H, w3 + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + mov x7, #7 + _dec_get_msg_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _dec_get_msg_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/__asm_pack_unpack.S b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_pack_unpack.S new file mode 100644 index 0000000000..1178015c5f --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/__asm_pack_unpack.S @@ -0,0 +1,345 @@ + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_1_to_16 +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_1_to_16 +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_1_to_16, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_1_to_16: +_PQCLEAN_LIGHTSABER_AARCH64_asm_1_to_16: + + mov x15, #8 + _1_to_16_outer_loop: + + ldr w2, [x1], #4 + + mov x11, #4 + _1_to_16_inner_loop: + + sbfx w3, w2, #0, #1 + strh w3, [x0], #2 + sbfx w4, w2, #1, #1 + strh w4, [x0], #2 + sbfx w5, w2, #2, #1 + strh w5, [x0], #2 + sbfx w6, w2, #3, #1 + strh w6, [x0], #2 + sbfx w7, w2, #4, #1 + strh w7, [x0], #2 + sbfx w8, w2, #5, #1 + strh w8, [x0], #2 + sbfx w9, w2, #6, #1 + strh w9, [x0], #2 + sbfx w10, w2, #7, #1 + strh w10, [x0], #2 + + lsr w2, w2, #8 + + sub 
x11, x11, #1 + cbnz x11, _1_to_16_inner_loop + + sub x15, x15, #1 + cbnz x15, _1_to_16_outer_loop + + br lr + + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_4_to_16 +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_4_to_16 +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_4_to_16, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_4_to_16: +_PQCLEAN_LIGHTSABER_AARCH64_asm_4_to_16: + + mov x7, #32 + _4_to_16_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + strh w3, [x0], #2 + sbfx w4, w2, #4, #4 + strh w4, [x0], #2 + sbfx w5, w2, #8, #4 + strh w5, [x0], #2 + sbfx w6, w2, #12, #4 + strh w6, [x0], #2 + + sbfx w3, w2, #16, #4 + strh w3, [x0], #2 + sbfx w4, w2, #20, #4 + strh w4, [x0], #2 + sbfx w5, w2, #24, #4 + strh w5, [x0], #2 + sbfx w6, w2, #28, #4 + strh w6, [x0], #2 + + sub x7, x7, #1 + cbnz x7, _4_to_16_loop + + br lr + + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_10_to_32 +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_10_to_32 +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_10_to_32, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_10_to_32: +_PQCLEAN_LIGHTSABER_AARCH64_asm_10_to_32: + + mov x7, #16 + _10_to_32_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #10 + str w3, [x0], #4 + sbfx w4, w2, #10, #10 + str w4, [x0], #4 + sbfx w5, w2, #20, #10 + str w5, [x0], #4 + lsr w6, w2, #30 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #8 + lsl w3, w3, #2 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #8, #10 + str w4, [x0], #4 + sbfx w5, w2, #18, #10 + str w5, [x0], #4 + lsr w6, w2, #28 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #6 + lsl w3, w3, #4 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #6, #10 + str w4, [x0], #4 + sbfx w5, w2, #16, #10 + str w5, [x0], #4 + lsr w6, w2, #26 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + lsl w3, w3, #6 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #4, #10 + str w4, [x0], #4 + sbfx w5, w2, #14, #10 + str w5, [x0], #4 + lsr w6, w2, #24 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #2 + lsl w3, w3, #8 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #2, #10 + str w4, [x0], #4 + sbfx w5, w2, #12, #10 + str w5, [x0], #4 + sbfx w6, w2, #22, #10 + str w6, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _10_to_32_loop + + br lr + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32 +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32 +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32: +_PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32: + + mov x7, #8 + _13_to_32_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #13 + str w3, [x0], #4 + sbfx w4, w2, #13, #13 + str w4, [x0], #4 + lsr w5, w2, #26 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #7 + lsl w3, w3, #6 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #7, #13 + str w4, [x0], #4 + lsr w5, w2, #20 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #1 + lsl w3, w3, #12 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #1, #13 + str w4, [x0], #4 + sbfx w5, w2, #14, #13 + str w5, [x0], #4 + lsr w5, w2, #27 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #8 + lsl w3, w3, #5 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #8, #13 + str w4, [x0], #4 + lsr w5, w2, #21 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #2 + lsl w3, w3, #11 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #2, #13 + str w4, [x0], #4 + sbfx w5, w2, #15, #13 + str w5, [x0], #4 + lsr w5, w2, #28 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #9 + lsl w3, w3, #4 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #9, #13 + str w4, [x0], #4 + lsr w5, w2, #22 + + ldr w2, [x1], #4 + + 
sbfx w3, w2, #0, #3 + lsl w3, w3, #10 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #3, #13 + str w4, [x0], #4 + sbfx w5, w2, #16, #13 + str w5, [x0], #4 + lsr w5, w2, #29 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #10 + lsl w3, w3, #3 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #10, #13 + str w4, [x0], #4 + lsr w5, w2, #23 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + lsl w3, w3, #9 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #4, #13 + str w4, [x0], #4 + sbfx w5, w2, #17, #13 + str w5, [x0], #4 + lsr w5, w2, #30 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #11 + lsl w3, w3, #2 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #11, #13 + str w4, [x0], #4 + lsr w5, w2, #24 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #5 + lsl w3, w3, #8 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #5, #13 + str w4, [x0], #4 + sbfx w5, w2, #18, #13 + str w5, [x0], #4 + lsr w5, w2, #31 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #12 + lsl w3, w3, #1 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #12, #13 + str w4, [x0], #4 + lsr w5, w2, #25 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #6 + lsl w3, w3, #7 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #6, #13 + str w4, [x0], #4 + sbfx w5, w2, #19, #13 + str w5, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _13_to_32_loop + + br lr + + +.align 2 +.global PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32 +.global _PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32 +#ifndef __clang__ +.type PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32, %function +#endif +PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32: +_PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32: + + mov x7, #128 + _sbfx_loop: + + ldr w4, [x1], #4 + sbfx w5, w4, #0, #13 + sbfx w6, w4, #16, #13 + str w5, [x0], #4 + str w6, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _sbfx_loop + + br lr + + + + + + diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/api.h b/src/kem/saber/pqclean_lightsaber_aarch64/api.h new file mode 100644 index 0000000000..7e36ce7f2b --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/api.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_LIGHTSABER_AARCH64_API_H +#define PQCLEAN_LIGHTSABER_AARCH64_API_H + + +#define PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_ALGNAME "LightSaber" +#define PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_BYTES 32 +#define PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_CIPHERTEXTBYTES 736 +#define PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_PUBLICKEYBYTES 672 +#define PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_SECRETKEYBYTES 1568 + +int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + + +#endif /* PQCLEAN_LIGHTSABER_AARCH64_API_H */ diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/cbd.c b/src/kem/saber/pqclean_lightsaber_aarch64/cbd.c new file mode 100644 index 0000000000..49c759372a --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/cbd.c @@ -0,0 +1,79 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. 
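The cbd() routine below samples LightSaber's centered binomial distribution with parameter mu = 10: every coefficient is the Hamming weight of five fresh bits minus the Hamming weight of another five. The masked-shift loop works because summing (t >> j) & 0x0842108421 for j = 0..4 deposits the popcount of each 5-bit group of t into the matching 5-bit field of d, and no field can carry into its neighbour since a popcount is at most 5. A minimal standalone check of that identity, not taken from the patch (it assumes the GCC/Clang builtin __builtin_popcountll for the reference count):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        uint64_t t = 0x5A5A5A5A5AULL;            // arbitrary 40-bit sample
        uint64_t d = 0;
        for (int j = 0; j < 5; j++) {
            d += (t >> j) & 0x0842108421ULL;     // mask has bits 0,5,10,...,35
        }
        for (int k = 0; k < 8; k++) {            // eight 5-bit groups
            int ref = __builtin_popcountll((t >> (5 * k)) & 0x1f);
            int got = (int)((d >> (5 * k)) & 0x1f);
            printf("group %d: popcount %d, field %d\n", k, ref, got);
        }
        return 0;
    }

With d in that shape, the even-indexed fields become a[] and the odd-indexed fields become b[] in the unpacking that follows, and s = a - b is the sampled coefficient.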
+ * Copyright (c) 2020 by Cryptographic Engineering Research Group (CERG)
+ * ECE Department, George Mason University
+ * Fairfax, VA, U.S.A.
+ * Author: Duc Tri Nguyen
+=============================================================================*/
+
+#include "cbd.h"
+#include <arm_neon.h>
+
+#define vload4(c, ptr) c = vld4q_u8(ptr);
+#define vstore4(ptr, c) vst4q_u16(ptr, c);
+
+// c = a & b
+#define vand8(c, a, b) c = vandq_u8(a, b);
+
+// c = a >> n
+#define vsr8(c, a, n) c = vshrq_n_u8(a, n);
+
+// c = a + b
+#define vadd8(c, a, b) c = vaddq_u8(a, b);
+
+// low c = (uint16_t) (a - b)
+#define vsubll8(c, a, b) c = vsubl_u8(a, b);
+
+// high c = (uint16_t) (a - b)
+#define vsublh8(c, a, b) c = vsubl_high_u8(a, b);
+
+
+
+
+static uint64_t load40_littleendian(const uint8_t x[5]) {
+    uint64_t r;
+    r = (uint64_t) x[0];
+    r |= (uint64_t) x[1] << 8;
+    r |= (uint64_t) x[2] << 16;
+    r |= (uint64_t) x[3] << 24;
+    r |= (uint64_t) x[4] << 32;
+    return r;
+}
+
+static
+void cbd5(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
+    uint64_t t, d, a[4], b[4];
+    int i, j;
+
+    for (i = 0; i < SABER_N / 4; i++) {
+        t = load40_littleendian(buf);
+        d = 0;
+        for (j = 0; j < 5; j++) {
+            d += (t >> j) & 0x0842108421UL;
+        }
+
+        a[0] = d & 0x1f;
+        b[0] = (d >> 5) & 0x1f;
+        a[1] = (d >> 10) & 0x1f;
+        b[1] = (d >> 15) & 0x1f;
+        a[2] = (d >> 20) & 0x1f;
+        b[2] = (d >> 25) & 0x1f;
+        a[3] = (d >> 30) & 0x1f;
+        b[3] = (d >> 35);
+
+        s[4 * i + 0] = (uint16_t)(a[0] - b[0]);
+        s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+        s[4 * i + 2] = (uint16_t)(a[2] - b[2]);
+        s[4 * i + 3] = (uint16_t)(a[3] - b[3]);
+
+        buf += 5;
+    }
+}
+
+void cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
+    cbd5(s, buf);
+}
diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/cbd.h b/src/kem/saber/pqclean_lightsaber_aarch64/cbd.h
new file mode 100644
index 0000000000..6e5c360cb3
--- /dev/null
+++ b/src/kem/saber/pqclean_lightsaber_aarch64/cbd.h
@@ -0,0 +1,17 @@
+#ifndef CBD_H
+#define CBD_H
+/*---------------------------------------------------------------------
+This file has been adapted from the implementation
+(available at, Public Domain https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M.
Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ + +#include "SABER_params.h" +#include + +#define cbd SABER_NAMESPACE(cbd) +void cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); + +#endif diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/fips202x2.c b/src/kem/saber/pqclean_lightsaber_aarch64/fips202x2.c new file mode 100644 index 0000000000..3924900e9e --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/fips202x2.c @@ -0,0 +1,646 @@ +#include "fips202x2.h" +#include +#include + + +#define NROUNDS 24 + +// Define NEON operation +// c = load(ptr) +#define vload(ptr) vld1q_u64(ptr); +// ptr <= c; +#define vstore(ptr, c) vst1q_u64(ptr, c); +// c = a ^ b +#define vxor(c, a, b) c = veorq_u64(a, b); +// Rotate by n bit ((a << offset) ^ (a >> (64-offset))) +#define vROL(out, a, offset) \ + (out) = vshlq_n_u64(a, offset); \ + (out) = vsriq_n_u64(out, a, 64 - (offset)); +// Xor chain: out = a ^ b ^ c ^ d ^ e +#define vXOR4(out, a, b, c, d, e) \ + (out) = veorq_u64(a, b); \ + (out) = veorq_u64(out, c); \ + (out) = veorq_u64(out, d); \ + (out) = veorq_u64(out, e); +// Not And c = ~a & b +// #define vbic(c, a, b) c = vbicq_u64(b, a); +// Xor Not And: out = a ^ ( (~b) & c) +#define vXNA(out, a, b, c) \ + (out) = vbicq_u64(c, b); \ + (out) = veorq_u64(out, a); +// Rotate by 1 bit, then XOR: a ^ ROL(b): SHA1 instruction, not support +#define vrxor(c, a, b) c = vrax1q_u64(a, b); +// End Define + +/* Keccak round constants */ +static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +/************************************************* +* Name: KeccakF1600_StatePermutex2 +* +* Description: The Keccak F1600 Permutation +* +* Arguments: - uint64_t *state: pointer to input/output Keccak state +**************************************************/ +static inline +void KeccakF1600_StatePermutex2(v128 state[25]) { + v128 Aba, Abe, Abi, Abo, Abu; + v128 Aga, Age, Agi, Ago, Agu; + v128 Aka, Ake, Aki, Ako, Aku; + v128 Ama, Ame, Ami, Amo, Amu; + v128 Asa, Ase, Asi, Aso, Asu; + v128 BCa, BCe, BCi, BCo, BCu; // tmp + v128 Da, De, Di, Do, Du; // D + v128 Eba, Ebe, Ebi, Ebo, Ebu; + v128 Ega, Ege, Egi, Ego, Egu; + v128 Eka, Eke, Eki, Eko, Eku; + v128 Ema, Eme, Emi, Emo, Emu; + v128 Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = state[0]; + Abe = state[1]; + Abi = state[2]; + Abo = state[3]; + Abu = state[4]; + Aga = state[5]; + Age = state[6]; + Agi = state[7]; + Ago = state[8]; + Agu = state[9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; 
+ Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for (int round = 0; round < NROUNDS; round += 2) { + // prepareTheta + vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); + vXOR4(BCe, Abe, Age, Ake, Ame, Ase); + vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); + vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); + vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Aba, Aba, Da); + vxor(Age, Age, De); + vROL(BCe, Age, 44); + vxor(Aki, Aki, Di); + vROL(BCi, Aki, 43); + vxor(Amo, Amo, Do); + vROL(BCo, Amo, 21); + vxor(Asu, Asu, Du); + vROL(BCu, Asu, 14); + vXNA(Eba, Aba, BCe, BCi); + vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); + vXNA(Ebe, BCe, BCi, BCo); + vXNA(Ebi, BCi, BCo, BCu); + vXNA(Ebo, BCo, BCu, Aba); + vXNA(Ebu, BCu, Aba, BCe); + + vxor(Abo, Abo, Do); + vROL(BCa, Abo, 28); + vxor(Agu, Agu, Du); + vROL(BCe, Agu, 20); + vxor(Aka, Aka, Da); + vROL(BCi, Aka, 3); + vxor(Ame, Ame, De); + vROL(BCo, Ame, 45); + vxor(Asi, Asi, Di); + vROL(BCu, Asi, 61); + vXNA(Ega, BCa, BCe, BCi); + vXNA(Ege, BCe, BCi, BCo); + vXNA(Egi, BCi, BCo, BCu); + vXNA(Ego, BCo, BCu, BCa); + vXNA(Egu, BCu, BCa, BCe); + + vxor(Abe, Abe, De); + vROL(BCa, Abe, 1); + vxor(Agi, Agi, Di); + vROL(BCe, Agi, 6); + vxor(Ako, Ako, Do); + vROL(BCi, Ako, 25); + vxor(Amu, Amu, Du); + vROL(BCo, Amu, 8); + vxor(Asa, Asa, Da); + vROL(BCu, Asa, 18); + vXNA(Eka, BCa, BCe, BCi); + vXNA(Eke, BCe, BCi, BCo); + vXNA(Eki, BCi, BCo, BCu); + vXNA(Eko, BCo, BCu, BCa); + vXNA(Eku, BCu, BCa, BCe); + + vxor(Abu, Abu, Du); + vROL(BCa, Abu, 27); + vxor(Aga, Aga, Da); + vROL(BCe, Aga, 36); + vxor(Ake, Ake, De); + vROL(BCi, Ake, 10); + vxor(Ami, Ami, Di); + vROL(BCo, Ami, 15); + vxor(Aso, Aso, Do); + vROL(BCu, Aso, 56); + vXNA(Ema, BCa, BCe, BCi); + vXNA(Eme, BCe, BCi, BCo); + vXNA(Emi, BCi, BCo, BCu); + vXNA(Emo, BCo, BCu, BCa); + vXNA(Emu, BCu, BCa, BCe); + + vxor(Abi, Abi, Di); + vROL(BCa, Abi, 62); + vxor(Ago, Ago, Do); + vROL(BCe, Ago, 55); + vxor(Aku, Aku, Du); + vROL(BCi, Aku, 39); + vxor(Ama, Ama, Da); + vROL(BCo, Ama, 41); + vxor(Ase, Ase, De); + vROL(BCu, Ase, 2); + vXNA(Esa, BCa, BCe, BCi); + vXNA(Ese, BCe, BCi, BCo); + vXNA(Esi, BCi, BCo, BCu); + vXNA(Eso, BCo, BCu, BCa); + vXNA(Esu, BCu, BCa, BCe); + + // Next Round + + // prepareTheta + vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); + vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); + vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); + vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); + vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Eba, Eba, Da); + vxor(Ege, Ege, De); + vROL(BCe, Ege, 44); + vxor(Eki, Eki, Di); + vROL(BCi, Eki, 43); + vxor(Emo, Emo, Do); + vROL(BCo, Emo, 21); + vxor(Esu, Esu, Du); + vROL(BCu, Esu, 14); + vXNA(Aba, Eba, BCe, BCi); + vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); + vXNA(Abe, BCe, BCi, BCo); + vXNA(Abi, BCi, BCo, BCu); + vXNA(Abo, BCo, BCu, Eba); + vXNA(Abu, BCu, Eba, BCe); + + vxor(Ebo, Ebo, Do); + vROL(BCa, Ebo, 28); + vxor(Egu, Egu, Du); + vROL(BCe, Egu, 20); + vxor(Eka, Eka, Da); + vROL(BCi, Eka, 3); + vxor(Eme, Eme, De); + vROL(BCo, Eme, 45); + 
vxor(Esi, Esi, Di); + vROL(BCu, Esi, 61); + vXNA(Aga, BCa, BCe, BCi); + vXNA(Age, BCe, BCi, BCo); + vXNA(Agi, BCi, BCo, BCu); + vXNA(Ago, BCo, BCu, BCa); + vXNA(Agu, BCu, BCa, BCe); + + vxor(Ebe, Ebe, De); + vROL(BCa, Ebe, 1); + vxor(Egi, Egi, Di); + vROL(BCe, Egi, 6); + vxor(Eko, Eko, Do); + vROL(BCi, Eko, 25); + vxor(Emu, Emu, Du); + vROL(BCo, Emu, 8); + vxor(Esa, Esa, Da); + vROL(BCu, Esa, 18); + vXNA(Aka, BCa, BCe, BCi); + vXNA(Ake, BCe, BCi, BCo); + vXNA(Aki, BCi, BCo, BCu); + vXNA(Ako, BCo, BCu, BCa); + vXNA(Aku, BCu, BCa, BCe); + + vxor(Ebu, Ebu, Du); + vROL(BCa, Ebu, 27); + vxor(Ega, Ega, Da); + vROL(BCe, Ega, 36); + vxor(Eke, Eke, De); + vROL(BCi, Eke, 10); + vxor(Emi, Emi, Di); + vROL(BCo, Emi, 15); + vxor(Eso, Eso, Do); + vROL(BCu, Eso, 56); + vXNA(Ama, BCa, BCe, BCi); + vXNA(Ame, BCe, BCi, BCo); + vXNA(Ami, BCi, BCo, BCu); + vXNA(Amo, BCo, BCu, BCa); + vXNA(Amu, BCu, BCa, BCe); + + vxor(Ebi, Ebi, Di); + vROL(BCa, Ebi, 62); + vxor(Ego, Ego, Do); + vROL(BCe, Ego, 55); + vxor(Eku, Eku, Du); + vROL(BCi, Eku, 39); + vxor(Ema, Ema, Da); + vROL(BCo, Ema, 41); + vxor(Ese, Ese, De); + vROL(BCu, Ese, 2); + vXNA(Asa, BCa, BCe, BCi); + vXNA(Ase, BCe, BCi, BCo); + vXNA(Asi, BCi, BCo, BCu); + vXNA(Aso, BCo, BCu, BCa); + vXNA(Asu, BCu, BCa, BCe); + } + + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +/************************************************* +* Name: keccakx2_absorb +* +* Description: Absorb step of Keccak; +* non-incremental, starts by zeroeing the state. 
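The x2 functions below run two independent Keccak sponges at once: lane 0 of every v128 carries stream 0's state word and lane 1 carries stream 1's, so one permutation call advances both states. The absorb loop therefore interleaves the two input streams word by word using vzip1q_u64/vzip2q_u64. An illustrative standalone sketch of that shuffle (it assumes an AArch64 toolchain providing arm_neon.h; the input values are arbitrary):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
        uint64_t in0[2] = {0xAAAA0000, 0xAAAA1111};  // two words of stream 0
        uint64_t in1[2] = {0xBBBB0000, 0xBBBB1111};  // two words of stream 1
        uint64x2_t a  = vld1q_u64(in0);
        uint64x2_t b  = vld1q_u64(in1);
        uint64x2_t s0 = vzip1q_u64(a, b);            // lanes: in0[0], in1[0]
        uint64x2_t s1 = vzip2q_u64(a, b);            // lanes: in0[1], in1[1]
        printf("%llx %llx\n", (unsigned long long)vgetq_lane_u64(s0, 0),
               (unsigned long long)vgetq_lane_u64(s0, 1));
        printf("%llx %llx\n", (unsigned long long)vgetq_lane_u64(s1, 0),
               (unsigned long long)vgetq_lane_u64(s1, 1));
        return 0;
    }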
+* +* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - const uint8_t *m: pointer to input to be absorbed into s +* - size_t mlen: length of input in bytes +* - uint8_t p: domain-separation byte for different +* Keccak-derived functions +**************************************************/ +static +void keccakx2_absorb(v128 s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen, + uint8_t p) { + size_t i, pos = 0; + + // Declare SIMD registers + v128 tmp, mask; + uint64x1_t a, b; + uint64x2_t a1, b1, atmp1, btmp1; + uint64x2x2_t a2, b2, atmp2, btmp2; + // End + + for (i = 0; i < 25; ++i) { + s[i] = vdupq_n_u64(0); + } + + // Load in0[i] to register, then in1[i] to register, exchange them + while (inlen >= r) { + for (i = 0; i < r / 8 - 1; i += 4) { + a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); + b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); + atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); + // AC = zip2(AB and CD) + btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); + btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); + + vxor(s[i + 0], s[i + 0], atmp2.val[0]); + vxor(s[i + 1], s[i + 1], btmp2.val[0]); + vxor(s[i + 2], s[i + 2], atmp2.val[1]); + vxor(s[i + 3], s[i + 3], btmp2.val[1]); + + pos += 8 * 2 * 2; + } + // Last iteration + i = r / 8 - 1; + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + pos += 8; + + KeccakF1600_StatePermutex2(s); + inlen -= r; + } + + i = 0; + while (inlen >= 16) { + a1 = vld1q_u64((uint64_t *)&in0[pos]); + b1 = vld1q_u64((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp1 = vzip1q_u64(a1, b1); + // AC = zip2(AB and CD) + btmp1 = vzip2q_u64(a1, b1); + + vxor(s[i + 0], s[i + 0], atmp1); + vxor(s[i + 1], s[i + 1], btmp1); + + i += 2; + pos += 8 * 2; + inlen -= 8 * 2; + } + + if (inlen >= 8) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + + i++; + pos += 8; + inlen -= 8; + } + + if (inlen) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); + tmp = vandq_u64(tmp, mask); + vxor(s[i], s[i], tmp); + } + + tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); + vxor(s[i], s[i], tmp); + + mask = vdupq_n_u64(1ULL << 63); + vxor(s[r / 8 - 1], s[r / 8 - 1], mask); +} + +/************************************************* +* Name: keccak_squeezeblocks +* +* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. +* Modifies the state. Can be called multiple times to keep +* squeezing, i.e., is incremental. 
+* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed (written to h) +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - uint64_t *s: pointer to input/output Keccak state +**************************************************/ +static +void keccakx2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + unsigned int r, + v128 s[25]) { + unsigned int i; + + uint64x1_t a, b; + uint64x2x2_t a2, b2; + + while (nblocks > 0) { + KeccakF1600_StatePermutex2(s); + + for (i = 0; i < r / 8 - 1; i += 4) { + a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); + b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); + a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); + b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); + vst1q_u64_x2((uint64_t *)out0, a2); + vst1q_u64_x2((uint64_t *)out1, b2); + + out0 += 32; + out1 += 32; + } + + i = r / 8 - 1; + // Last iteration + a = vget_low_u64(s[i]); + b = vget_high_u64(s[i]); + vst1_u64((uint64_t *)out0, a); + vst1_u64((uint64_t *)out1, b); + + out0 += 8; + out1 += 8; + + --nblocks; + } +} + +/************************************************* +* Name: shake128x2_absorb +* +* Description: Absorb step of the SHAKE128 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake128_squeezeblocks +* +* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of +* SHAKE128_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake128x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); +} + +/************************************************* +* Name: shake256_absorb +* +* Description: Absorb step of the SHAKE256 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake256_squeezeblocks +* +* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of +* SHAKE256_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. 
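Taken together, the absorb and squeeze halves give a two-for-one XOF: two equal-length inputs are hashed for roughly the price of one permutation stream. A caller-side sketch using the one-shot shake128x2 wrapper declared in this patch (the seed-plus-domain-byte framing and the function name expand_two_seeds are illustrative, not something the patch prescribes):

    #include <stdint.h>
    #include <string.h>
    #include "fips202x2.h"

    void expand_two_seeds(uint8_t out0[128], uint8_t out1[128],
                          const uint8_t seed[32]) {
        uint8_t in0[33], in1[33];
        memcpy(in0, seed, 32);
        memcpy(in1, seed, 32);
        in0[32] = 0;    // per-stream domain separation byte
        in1[32] = 1;
        // Both streams are absorbed and squeezed by the same Keccak-f calls.
        shake128x2(out0, out1, 128, in0, in1, 33);
    }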
+* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake256x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s); +} + +/************************************************* +* Name: shake128 +* +* Description: SHAKE128 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[2][SHAKE128_RATE]; + keccakx2_state state; + + shake128x2_absorb(&state, in0, in1, inlen); + shake128x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + shake128x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} + +/************************************************* +* Name: shake256 +* +* Description: SHAKE256 XOF with non-incremental API +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: requested output length in bytes +* - const uint8_t *in: pointer to input +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[2][SHAKE256_RATE]; + keccakx2_state state; + + shake256x2_absorb(&state, in0, in1, inlen); + shake256x2_squeezeblocks(out0, out1, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + shake256x2_squeezeblocks(t[0], t[1], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + } + } +} diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/fips202x2.h b/src/kem/saber/pqclean_lightsaber_aarch64/fips202x2.h new file mode 100644 index 0000000000..11579f3015 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/fips202x2.h @@ -0,0 +1,54 @@ +#ifndef FIPS202X2_H +#define FIPS202X2_H + +#include "SABER_params.h" +#include +#include +#include "fips202.h" +typedef uint64x2_t v128; + +typedef struct { + v128 s[25]; +} keccakx2_state; + + +#define shake128x2_absorb SABER_NAMESPACE(shake128x2_absorb) +void shake128x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); + +#define shake128x2_squeezeblocks SABER_NAMESPACE(shake128x2_squeezeblocks) +void shake128x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state); + +#define shake256x2_absorb SABER_NAMESPACE(shake256x2_absorb) +void shake256x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); + +#define shake256x2_squeezeblocks SABER_NAMESPACE(shake256x2_squeezeblocks) +void shake256x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + 
keccakx2_state *state); + +#define shake128x2 SABER_NAMESPACE(shake128x2) +void shake128x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); + +#define shake256x2 SABER_NAMESPACE(shake256x2) +void shake256x2(uint8_t *out0, + uint8_t *out1, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen); +#endif diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/kem.c b/src/kem/saber/pqclean_lightsaber_aarch64/kem.c new file mode 100644 index 0000000000..987ee8404b --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/kem.c @@ -0,0 +1,84 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. +=============================================================================*/ + +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "fips202.h" +#include "kem.h" +#include "randombytes.h" +#include "verify.h" +#include +#include +#include + +int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + int i; + + indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk + for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { + sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk + } + + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended. + + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number. + // This is output when check in crypto_kem_dec() fails. 
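+    // Resulting secret-key layout, as produced by the stores above:
+    //   sk = indcpa_sk (SABER_INDCPA_SECRETKEYBYTES)
+    //        || pk (SABER_INDCPA_PUBLICKEYBYTES)
+    //        || sha3_256(pk) (32 bytes)
+    //        || z (SABER_KEYBYTES bytes, used for implicit rejection)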
+    return (0);
+}
+
+int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) {
+
+    unsigned char kr[64]; // Will contain key, coins
+    unsigned char buf[64];
+
+    randombytes(buf, 32);
+
+    sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash does not release system RNG output
+
+    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM
+
+    sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]);
+    // K^ <-- kr[0:31]
+    // noiseseed (r) <-- kr[32:63];
+    indcpa_kem_enc(buf, kr + 32, pk, c); // buf[0:31] contains message; kr[32:63] contains randomness r;
+
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
+
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
+
+int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) {
+    int i, fail;
+    unsigned char cmp[SABER_BYTES_CCA_DEC];
+    unsigned char buf[64];
+    unsigned char kr[64]; // Will contain key, coins
+    const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
+
+    indcpa_kem_dec(sk, c, buf); // buf[0:31] <-- message
+
+    // Multitarget countermeasure for coins + contributory KEM
+    for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk
+        buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i];
+    }
+
+    sha3_512(kr, buf, 64);
+
+    indcpa_kem_enc(buf, kr + 32, pk, cmp);
+
+    fail = verify(c, cmp, SABER_BYTES_CCA_DEC);
+
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c)
+
+    cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail);
+
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/kem.h b/src/kem/saber/pqclean_lightsaber_aarch64/kem.h
new file mode 100644
index 0000000000..b11c99bdbb
--- /dev/null
+++ b/src/kem/saber/pqclean_lightsaber_aarch64/kem.h
@@ -0,0 +1,17 @@
+#ifndef KEM_H
+#define KEM_H
+/*=============================================================================
+This file has been adapted from the implementation
+(available at, Public Domain https://github.com/KULeuven-COSIC/SABER)
+of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM"
+by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren
+Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso.
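These three calls compose into the usual KEM round trip. A caller-side sketch against the sizes and prototypes from api.h (standalone; seeding of randombytes is left to the harness):

    #include <assert.h>
    #include <string.h>
    #include "api.h"

    void roundtrip(void) {
        unsigned char pk[PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_PUBLICKEYBYTES];
        unsigned char sk[PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_SECRETKEYBYTES];
        unsigned char ct[PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_CIPHERTEXTBYTES];
        unsigned char k1[PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_BYTES];
        unsigned char k2[PQCLEAN_LIGHTSABER_AARCH64_CRYPTO_BYTES];

        PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_keypair(pk, sk);
        PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_enc(ct, k1, pk);
        PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_dec(k2, ct, sk);
        assert(memcmp(k1, k2, sizeof k1) == 0);  // shared secrets agree
    }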
+=============================================================================*/ + +#include + +int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk); +int PQCLEAN_LIGHTSABER_AARCH64_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk); + +#endif diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/macros.inc b/src/kem/saber/pqclean_lightsaber_aarch64/macros.inc new file mode 100644 index 0000000000..88c3675f29 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/macros.inc @@ -0,0 +1,57 @@ + +#ifndef MACROS_S +#define MACROS_S + +#include "macros_common.inc" + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro wrap_4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3, dS, qS, dD + + \mulacc \l0\dD, \a0\dS, \b0\dS + \mulacc2 \h0\dD, \a0\qS, \b0\qS + \mulacc \l1\dD, \a0\dS, \b1\dS + \mulacc2 \h1\dD, \a0\qS, \b1\qS + \mulacc \l2\dD, \a0\dS, \b2\dS + \mulacc2 \h2\dD, \a0\qS, \b2\qS + \mulacc \l3\dD, \a0\dS, \b3\dS + \mulacc2 \h3\dD, \a0\qS, \b3\qS + +.endm + +.macro _4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_4x4_asymmetric \mulacc, \mulacc2, \a0, \b0, \b1, \b2, \b3, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .2S, .4S, .2D +.endm + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_add_sub \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm + +#endif + diff --git 
a/src/kem/saber/pqclean_lightsaber_aarch64/macros_common.inc b/src/kem/saber/pqclean_lightsaber_aarch64/macros_common.inc new file mode 100644 index 0000000000..26e7cbb5da --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/macros_common.inc @@ -0,0 +1,434 @@ + +#ifndef MACROS_COMMON +#define MACROS_COMMON + +// for ABI + +.macro push_all + + sub sp, sp, #(16*9) + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp d8, d9, [sp, #16*5] + stp d10, d11, [sp, #16*6] + stp d12, d13, [sp, #16*7] + stp d14, d15, [sp, #16*8] + +.endm + +.macro pop_all + + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp d10, d11, [sp, #16*6] + ldp d12, d13, [sp, #16*7] + ldp d14, d15, [sp, #16*8] + add sp, sp, #(16*9) + +.endm + +// vector-scalar butterflies + +.macro wrap_dX_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + sub \b2\wX, \a2\wX, \t2\wX + sub \b3\wX, \a3\wX, \t3\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + add \a2\wX, \a2\wX, \t2\wX + add \a3\wX, \a3\wX, \t3\wX + +.endm + +.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, 
b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b4\wX, \a4\wX, \t4\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b5\wX, \a5\wX, \t5\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b6\wX, \a6\wX, \t6\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + sub \b7\wX, \a7\wX, \t7\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a4\wX, \a4\wX, \t4\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a5\wX, \a5\wX, \t5\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a6\wX, \a6\wX, \t6\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + add \a7\wX, \a7\wX, \t7\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +// vector-vector butterflies + +.macro wrap_dX_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + mul \t1\wX, \b1\wX, \h1\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +// vector-scalar Barrett reduction + +.macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + + srshr 
\t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// vector-vector Barrett reduction + +.macro wrap_qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + + srshr \t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\wX + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\wX + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\wX + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// Montgomery multiplication + +.macro wrap_qX_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \b0\wX, \t0\wX, \z0\nX[\h0] + mul \b1\wX, \t1\wX, \z1\nX[\h1] + mul \b2\wX, \t2\wX, \z2\nX[\h2] + mul \b3\wX, \t3\wX, \z3\nX[\h3] + + sqrdmulh \t0\wX, \t0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \t1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \t2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \t3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +// Montgomery reduction with long + +.macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX + + uzp1 \t0\wX, \l0\wX, \h0\wX + uzp1 \t1\wX, \l1\wX, \h1\wX + uzp1 \t2\wX, \l2\wX, \h2\wX + uzp1 \t3\wX, \l3\wX, \h3\wX + + mul \t0\wX, 
\t0\wX, \Qprime\wX + mul \t1\wX, \t1\wX, \Qprime\wX + mul \t2\wX, \t2\wX, \Qprime\wX + mul \t3\wX, \t3\wX, \Qprime\wX + + smlal \l0\dwX, \t0\lX, \Q\lX + smlal2 \h0\dwX, \t0\wX, \Q\wX + smlal \l1\dwX, \t1\lX, \Q\lX + smlal2 \h1\dwX, \t1\wX, \Q\wX + smlal \l2\dwX, \t2\lX, \Q\lX + smlal2 \h2\dwX, \t2\wX, \Q\wX + smlal \l3\dwX, \t3\lX, \Q\lX + smlal2 \h3\dwX, \t3\wX, \Q\wX + + uzp2 \c0\wX, \l0\wX, \h0\wX + uzp2 \c1\wX, \l1\wX, \h1\wX + uzp2 \c2\wX, \l2\wX, \h2\wX + uzp2 \c3\wX, \l3\wX, \h3\wX + +.endm + +// add_sub, sub_add + +.macro wrap_qX_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + add \s0\wX, \a0\wX, \b0\wX + sub \t0\wX, \a0\wX, \b0\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s3\wX, \a3\wX, \b3\wX + sub \t3\wX, \a3\wX, \b3\wX + +.endm + +.macro wrap_qX_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + sub \t0\wX, \a0\wX, \b0\wX + add \s0\wX, \a0\wX, \b0\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t3\wX, \a3\wX, \b3\wX + add \s3\wX, \a3\wX, \b3\wX + +.endm + + +#endif + + + + diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/pack_unpack.c b/src/kem/saber/pqclean_lightsaber_aarch64/pack_unpack.c new file mode 100644 index 0000000000..32a1b3bd9f --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/pack_unpack.c @@ -0,0 +1,195 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. 
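pack_unpack.c below converts between 16-bit coefficient arrays and densely packed byte strings. The tightest layout for LightSaber is the T-packing, eight 3-bit coefficients into three bytes; a standalone scalar sketch of one such group, mirroring the POLT2BS and BS2POLT bit layout (inputs assumed already reduced mod 8):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint16_t in[8] = {1, 7, 2, 5, 0, 3, 6, 4};   // 3-bit values
        uint8_t bytes[3];
        // Pack: 8 x 3 bits -> 24 bits.
        bytes[0] = (uint8_t)((in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6));
        bytes[1] = (uint8_t)(((in[2] >> 2) & 0x1) | ((in[3] & 0x7) << 1)
                             | ((in[4] & 0x7) << 4) | (in[5] << 7));
        bytes[2] = (uint8_t)(((in[5] >> 1) & 0x3) | ((in[6] & 0x7) << 2) | (in[7] << 5));
        // Unpack, reducing mod 8 to recover the inputs.
        uint16_t out[8];
        out[0] = bytes[0] & 0x7;
        out[1] = (bytes[0] >> 3) & 0x7;
        out[2] = ((bytes[0] >> 6) | (bytes[1] << 2)) & 0x7;
        out[3] = (bytes[1] >> 1) & 0x7;
        out[4] = (bytes[1] >> 4) & 0x7;
        out[5] = ((bytes[1] >> 7) | (bytes[2] << 1)) & 0x7;
        out[6] = (bytes[2] >> 2) & 0x7;
        out[7] = (bytes[2] >> 5) & 0x7;
        for (int i = 0; i < 8; i++) {
            printf("%d -> %d\n", in[i], out[i]);
        }
        return 0;
    }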
+=============================================================================*/ + + +#include "api.h" +#include "pack_unpack.h" +#include + +/* This function reduces its input mod T */ +void POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { + size_t j; + const uint16_t *in = data; + uint8_t *out = bytes; + for (j = 0; j < SABER_N / 8; j++) { + out[0] = (uint8_t) ((in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6)); + out[1] = (uint8_t) (((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7)); + out[2] = (uint8_t) (((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5)); + in += 8; + out += 3; + } +} + +/* This function does NOT reduce its output mod T */ +void BS2POLT(const uint8_t bytes[SABER_SCALEBYTES_KEM], uint16_t data[SABER_N]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data; + for (j = 0; j < SABER_N / 8; j++) { + out[0] = in[0]; + out[1] = in[0] >> 3; + out[2] = (in[0] >> 6) | (in[1] << 2); + out[3] = in[1] >> 1; + out[4] = in[1] >> 4; + out[5] = (in[1] >> 7) | (in[2] << 1); + out[6] = in[2] >> 2; + out[7] = in[2] >> 5; + in += 3; + out += 8; + } +} + +/* This function reduces its input mod q */ +void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { + size_t i; + const uint16_t *in = data; + uint8_t *out = bytes; + for (i = 0; i < SABER_N / 8; i++) { + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); + in += 8; + out += 13; + } +} + +/* This function sign-extends its output from q-bit to 16-bit. +This is needed by 16-bit NTTs */ +void BS2POLq(const uint8_t bytes[SABER_POLYBYTES], uint16_t data[SABER_N]) { + size_t i; + const uint8_t *in = bytes; + int16_t *out = (int16_t *)data; + + struct int13_t { // bitfield struct to sign-extend q-bit to 16-bit. 
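+    // Storing a raw SABER_EQ-bit pattern into this signed bitfield and
+    // reading it back sign-extends it to int for free, so no explicit
+    // shift-left/arithmetic-shift-right pair is needed before the
+    // (int16_t) casts below.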
+signed int bits: + SABER_EQ; + } q0, q1, q2, q3, q4, q5, q6, q7; + + for (i = 0; i < SABER_N / 8; i++) { + q0.bits = (in[0]) | (in[1] << 8); + q1.bits = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + q2.bits = (in[3] >> 2) | (in[4] << 6); + q3.bits = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + q4.bits = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + q5.bits = (in[8] >> 1) | (in[9] << 7); + q6.bits = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + q7.bits = (in[11] >> 3) | (in[12] << 5); + out[0] = (int16_t)q0.bits; + out[1] = (int16_t)q1.bits; + out[2] = (int16_t)q2.bits; + out[3] = (int16_t)q3.bits; + out[4] = (int16_t)q4.bits; + out[5] = (int16_t)q5.bits; + out[6] = (int16_t)q6.bits; + out[7] = (int16_t)q7.bits; + in += 13; + out += 8; + } +} + +/* This function reduces its input mod p */ +void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { + size_t i; + const uint16_t *in = data; + uint8_t *out = bytes; + for (i = 0; i < SABER_N / 4; i++) { + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); + in += 4; + out += 5; + } +} + +/* This function sign-extends its output from p-bit to 16-bit. +This is needed by the NTT */ +void BS2POLp(const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], uint16_t data[SABER_N]) { + size_t j; + const uint8_t *in = bytes; + int16_t *out = (int16_t *)data; + + struct int10_t { // bitfield struct to sign-extend p-bit to 16-bit. +signed int bits: + SABER_EP; + } p0, p1, p2, p3; + + for (j = 0; j < SABER_N / 4; j++) { + p0.bits = (in[0]) | (in[1] << 8); + p1.bits = (in[1] >> 2) | (in[2] << 6); + p2.bits = (in[2] >> 4) | (in[3] << 4); + p3.bits = (in[3] >> 6) | (in[4] << 2); + out[0] = (int16_t)p0.bits; + out[1] = (int16_t)p1.bits; + out[2] = (int16_t)p2.bits; + out[3] = (int16_t)p3.bits; + in += 5; + out += 4; + } +} + +void POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void BS2POLVECq(const uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void BS2POLVECp(const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void BS2POLmsg(const uint8_t bytes[SABER_KEYBYTES], uint16_t data[SABER_N]) { + PQCLEAN_LIGHTSABER_AARCH64_asm_1_to_16(&(data[0]), &(bytes[0])); +} + +/* This function reduces its input mod 2 */ +void POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { + size_t i, j; + uint8_t byte; + for (j = 0; j < SABER_KEYBYTES; j++) { + byte = 0; + for (i = 0; i < 8; i++) { + byte |= ((data[j * 8 + i] & 0x01) << i); + } + bytes[j] = byte; + } +} + + + + + diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/pack_unpack.h b/src/kem/saber/pqclean_lightsaber_aarch64/pack_unpack.h new file mode 100644 index 0000000000..4ca7db9ec3 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/pack_unpack.h @@ -0,0 +1,52 @@ 
+#ifndef PACK_UNPACK_H +#define PACK_UNPACK_H +/*============================================================================= +This file has been adapted from the implementation +(available, under Public Domain, at https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, Frederik Vercauteren, +Jose Maria Bermudo Mera, Michiel Van Beirendonck, and Andrea Basso. +=============================================================================*/ + +#include "SABER_params.h" +#include <stdint.h> +#include <stdio.h> + +extern void PQCLEAN_LIGHTSABER_AARCH64_asm_1_to_16(void *, const void *); +extern void PQCLEAN_LIGHTSABER_AARCH64_asm_4_to_16(void *, const void *); + +extern void PQCLEAN_LIGHTSABER_AARCH64_asm_10_to_32(void *, const void *); +extern void PQCLEAN_LIGHTSABER_AARCH64_asm_13_to_32(void *, const void *); +extern void PQCLEAN_LIGHTSABER_AARCH64_asm_16_to_32(void *, const void *); + +#define POLT2BS SABER_NAMESPACE(POLT2BS) +void POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); +#define BS2POLT SABER_NAMESPACE(BS2POLT) +void BS2POLT(const uint8_t bytes[SABER_SCALEBYTES_KEM], uint16_t data[SABER_N]); + +#define POLq2BS SABER_NAMESPACE(POLq2BS) +void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]); +#define POLp2BS SABER_NAMESPACE(POLp2BS) +void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]); + +#define BS2POLq SABER_NAMESPACE(BS2POLq) +void BS2POLq(const uint8_t bytes[SABER_POLYBYTES], uint16_t data[SABER_N]); +#define BS2POLp SABER_NAMESPACE(BS2POLp) +void BS2POLp(const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], uint16_t data[SABER_N]); + +#define POLVECq2BS SABER_NAMESPACE(POLVECq2BS) +void POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]); +#define POLVECp2BS SABER_NAMESPACE(POLVECp2BS) +void POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]); + +#define BS2POLVECq SABER_NAMESPACE(BS2POLVECq) +void BS2POLVECq(const uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]); +#define BS2POLVECp SABER_NAMESPACE(BS2POLVECp) +void BS2POLVECp(const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]); + +#define BS2POLmsg SABER_NAMESPACE(BS2POLmsg) +void BS2POLmsg(const uint8_t bytes[SABER_KEYBYTES], uint16_t data[SABER_N]); +#define POLmsg2BS SABER_NAMESPACE(POLmsg2BS) +void POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); + +#endif diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/verify.c new file mode 100644 index 0000000000..87a7acc486 --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/verify.c @@ -0,0 +1,34 @@ +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M.
Schanck, Peter Schwabe & Damien Stehle +----------------------------------------------------*/ +#include "verify.h" +#include <stddef.h> +#include <stdint.h> + +/* returns 0 for equal strings, 1 for non-equal strings */ +int verify(const unsigned char *a, const unsigned char *b, size_t len) { + uint64_t r; + size_t i; + r = 0; + + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; + } + + r = (-r) >> 63; + return r; +} + +/* b = 1 means mov, b = 0 means don't mov */ +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { + size_t i; + + b = -b; + for (i = 0; i < len; i++) { + r[i] ^= b & (x[i] ^ r[i]); + } +} diff --git a/src/kem/saber/pqclean_lightsaber_aarch64/verify.h new file mode 100644 index 0000000000..2a3aabe77d --- /dev/null +++ b/src/kem/saber/pqclean_lightsaber_aarch64/verify.h @@ -0,0 +1,21 @@ +#ifndef VERIFY_H +#define VERIFY_H +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle +----------------------------------------------------*/ +#include "SABER_params.h" +#include <stddef.h> + +/* returns 0 for equal strings, 1 for non-equal strings */ +#define verify SABER_NAMESPACE(verify) +int verify(const unsigned char *a, const unsigned char *b, size_t len); + +/* b = 1 means mov, b = 0 means don't mov */ +#define cmov SABER_NAMESPACE(cmov) +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); + +#endif diff --git a/src/kem/saber/pqclean_saber_aarch64/LICENSE new file mode 100644 index 0000000000..0e259d42c9 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others.
+ +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. 
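A note on the verify/cmov pair added in verify.c above: these are the standard constant-time building blocks for the Fujisaki-Okamoto transform used by CCA-secure KEMs. verify compares the re-encrypted ciphertext against the received one without an early exit, and cmov overwrites the candidate secret with a stored pseudorandom fallback without branching on secret data. A minimal standalone C sketch of how a decapsulation routine typically wires the two together follows; the function and buffer names (select_secret, kr, z, ct_cmp) are illustrative only and not part of this patch:

#include <stddef.h>
#include <stdint.h>

/* Same contract as verify.c above: returns 0 iff a == b, in constant time. */
static int ct_verify(const unsigned char *a, const unsigned char *b, size_t len) {
    uint64_t r = 0;
    for (size_t i = 0; i < len; i++) {
        r |= a[i] ^ b[i];   /* accumulate differences; no data-dependent exit */
    }
    return (int)((-r) >> 63);   /* 0 on match, 1 on any mismatch */
}

/* b = 1: copy x into r; b = 0: leave r untouched. No secret-dependent branch. */
static void ct_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
    b = -b;   /* 0x00 or 0xFF mask */
    for (size_t i = 0; i < len; i++) {
        r[i] ^= b & (x[i] ^ r[i]);
    }
}

/* Illustrative FO-style selection: kr holds the candidate secret seed, z a
   stored pseudorandom fallback; on re-encryption mismatch, kr is replaced. */
void select_secret(unsigned char kr[32], const unsigned char z[32],
                   const unsigned char *ct, const unsigned char *ct_cmp, size_t ct_len) {
    int fail = ct_verify(ct, ct_cmp, ct_len);   /* 0 on match, 1 on mismatch */
    ct_cmov(kr, z, 32, (unsigned char)fail);    /* overwrite only on failure */
}

The xor-and-mask form keeps both the comparison and the selection free of secret-dependent branches and memory addresses, which is why the Kyber-derived implementations above use exactly this shape.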
diff --git a/src/kem/saber/pqclean_saber_aarch64/NTT.h new file mode 100644 index 0000000000..f835f22cec --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/NTT.h @@ -0,0 +1,50 @@ +#ifndef NTT_H +#define NTT_H + +#include <stdint.h> + +#include "NTT_params.h" + +extern void PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_top(uint32_t *des, const uint32_t *table, const uint32_t *_constants); +extern void PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_bot(uint32_t *des, const uint32_t *table, const uint32_t *_constants); +extern void PQCLEAN_SABER_AARCH64_asm_intt_SIMD_top(uint32_t *des, const uint32_t *table, const uint32_t *_constants); +extern void PQCLEAN_SABER_AARCH64_asm_intt_SIMD_bot(uint32_t *des, const uint32_t *table, const uint32_t *_constants, const uint32_t *_inv_twist_const); +extern void PQCLEAN_SABER_AARCH64_asm_asymmetric_mul(uint32_t *src1, const uint32_t *src2, const uint32_t *src2_asymmetric, const uint32_t *_constants); +extern void PQCLEAN_SABER_AARCH64_asm_point_mul_extended(uint32_t *des, const uint32_t *src1, const uint32_t *src2_extended, const uint32_t *_constants); + +#define NTT(in) { \ + PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_top(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ + PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_bot(in, streamlined_CT_negacyclic_table_Q1_extended, constants); \ + } + +#define NTT_heavy(in_asymmetric, in) { \ + NTT(in); \ + PQCLEAN_SABER_AARCH64_asm_point_mul_extended(in_asymmetric, in, pre_asymmetric_table_Q1_extended, constants); \ + } + +#define iNTT(in) { \ + PQCLEAN_SABER_AARCH64_asm_intt_SIMD_top(in, streamlined_inv_CT_table_Q1_extended, constants); \ + PQCLEAN_SABER_AARCH64_asm_intt_SIMD_bot(in, streamlined_inv_CT_table_Q1_extended, constants, inv_twist_table_all_Q1_extended); \ + } + +static const uint32_t constants[16] = { + Q1, Q1prime2 +}; + +static const uint32_t streamlined_CT_negacyclic_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 3)) << 1] = { + 0, 0, -119635792, -1424544, 1027317558, 12232619, -496739340, -5914844, -253524894, -3018807, 9103545, 108399, 42771771, 509298, 283911363, 3380629, 0, 0, -66089826, -786954, -259955382, -3095377, -643539471, -7662843, -332278086, -3956548, 703146656, 8372606, -881793531, -10499815, 304160806, 3621746, 0, 0, 34506365, 410879, 663387313, 7899178, -615166382, -7324995, 242706356, 2889987, -1016509854, -12103928, -410776309, -4891253, -1039822114, -12381515, 0, 0, 770061100, 9169379, 176271869, 2098929, 377015451, 4489251, -777437559, -9257213, 185186875, 2205083, -476967921, -5679419, 111859832, 1331953, 0, 0, 267484771, 3185032, -241571930, -2876479, -116066229, -1382040, 605105697, 7205199, 246868243, 2939544, -801225576, -9540465, -29401110, -350089, 0, 0, 461101573, 5490493, -659878385, -7857396, -813049292, -9681254, -610503208, -7269469, 754028719, 8978476, -513464823, -6114000, 974898460, 11608447, 0, 0, -65601052, -781134, 122588677, 1459705, 406381289, 4838920, -584016855, -6954087, 1066347183, 12697358, -347834458, -4141783, -592155281, -7050994, 0, 0, 242486240, 2887366, 1001287142, 11922666, 375772353, 4474449, 752256115, 8957369, 322396534, 3838885, 525597088, 6258463, -971930207, -11573103, 0, 0, -983711428, -11713386, 6721989, 80041, -138847220, -1653301, 687033653, 8180743, -438460075, -5220893, 714691721, 8510077, -689918177, -8215090, 0, 0 +}; + +static const uint32_t pre_asymmetric_table_Q1_extended[NTT_N << 3] = { + -332278086, -3956548, -332278086, -3956548, -332278086, -3956548, -332278086, -3956548, 332278086, 3956548, 332278086, 3956548, 
332278086, 3956548, 332278086, 3956548, 703146656, 8372606, 703146656, 8372606, 703146656, 8372606, 703146656, 8372606, -703146656, -8372606, -703146656, -8372606, -703146656, -8372606, -703146656, -8372606, -881793531, -10499815, -881793531, -10499815, -881793531, -10499815, -881793531, -10499815, 881793531, 10499815, 881793531, 10499815, 881793531, 10499815, 881793531, 10499815, 304160806, 3621746, 304160806, 3621746, 304160806, 3621746, 304160806, 3621746, -304160806, -3621746, -304160806, -3621746, -304160806, -3621746, -304160806, -3621746, 242706356, 2889987, 242706356, 2889987, 242706356, 2889987, 242706356, 2889987, -242706356, -2889987, -242706356, -2889987, -242706356, -2889987, -242706356, -2889987, -1016509854, -12103928, -1016509854, -12103928, -1016509854, -12103928, -1016509854, -12103928, 1016509854, 12103928, 1016509854, 12103928, 1016509854, 12103928, 1016509854, 12103928, -410776309, -4891253, -410776309, -4891253, -410776309, -4891253, -410776309, -4891253, 410776309, 4891253, 410776309, 4891253, 410776309, 4891253, 410776309, 4891253, -1039822114, -12381515, -1039822114, -12381515, -1039822114, -12381515, -1039822114, -12381515, 1039822114, 12381515, 1039822114, 12381515, 1039822114, 12381515, 1039822114, 12381515, -777437559, -9257213, -777437559, -9257213, -777437559, -9257213, -777437559, -9257213, 777437559, 9257213, 777437559, 9257213, 777437559, 9257213, 777437559, 9257213, 185186875, 2205083, 185186875, 2205083, 185186875, 2205083, 185186875, 2205083, -185186875, -2205083, -185186875, -2205083, -185186875, -2205083, -185186875, -2205083, -476967921, -5679419, -476967921, -5679419, -476967921, -5679419, -476967921, -5679419, 476967921, 5679419, 476967921, 5679419, 476967921, 5679419, 476967921, 5679419, 111859832, 1331953, 111859832, 1331953, 111859832, 1331953, 111859832, 1331953, -111859832, -1331953, -111859832, -1331953, -111859832, -1331953, -111859832, -1331953, 605105697, 7205199, 605105697, 7205199, 605105697, 7205199, 605105697, 7205199, -605105697, -7205199, -605105697, -7205199, -605105697, -7205199, -605105697, -7205199, 246868243, 2939544, 246868243, 2939544, 246868243, 2939544, 246868243, 2939544, -246868243, -2939544, -246868243, -2939544, -246868243, -2939544, -246868243, -2939544, -801225576, -9540465, -801225576, -9540465, -801225576, -9540465, -801225576, -9540465, 801225576, 9540465, 801225576, 9540465, 801225576, 9540465, 801225576, 9540465, -29401110, -350089, -29401110, -350089, -29401110, -350089, -29401110, -350089, 29401110, 350089, 29401110, 350089, 29401110, 350089, 29401110, 350089, -610503208, -7269469, -610503208, -7269469, -610503208, -7269469, -610503208, -7269469, 610503208, 7269469, 610503208, 7269469, 610503208, 7269469, 610503208, 7269469, 754028719, 8978476, 754028719, 8978476, 754028719, 8978476, 754028719, 8978476, -754028719, -8978476, -754028719, -8978476, -754028719, -8978476, -754028719, -8978476, -513464823, -6114000, -513464823, -6114000, -513464823, -6114000, -513464823, -6114000, 513464823, 6114000, 513464823, 6114000, 513464823, 6114000, 513464823, 6114000, 974898460, 11608447, 974898460, 11608447, 974898460, 11608447, 974898460, 11608447, -974898460, -11608447, -974898460, -11608447, -974898460, -11608447, -974898460, -11608447, -584016855, -6954087, -584016855, -6954087, -584016855, -6954087, -584016855, -6954087, 584016855, 6954087, 584016855, 6954087, 584016855, 6954087, 584016855, 6954087, 1066347183, 12697358, 1066347183, 12697358, 1066347183, 12697358, 1066347183, 12697358, -1066347183, -12697358, 
-1066347183, -12697358, -1066347183, -12697358, -1066347183, -12697358, -347834458, -4141783, -347834458, -4141783, -347834458, -4141783, -347834458, -4141783, 347834458, 4141783, 347834458, 4141783, 347834458, 4141783, 347834458, 4141783, -592155281, -7050994, -592155281, -7050994, -592155281, -7050994, -592155281, -7050994, 592155281, 7050994, 592155281, 7050994, 592155281, 7050994, 592155281, 7050994, 752256115, 8957369, 752256115, 8957369, 752256115, 8957369, 752256115, 8957369, -752256115, -8957369, -752256115, -8957369, -752256115, -8957369, -752256115, -8957369, 322396534, 3838885, 322396534, 3838885, 322396534, 3838885, 322396534, 3838885, -322396534, -3838885, -322396534, -3838885, -322396534, -3838885, -322396534, -3838885, 525597088, 6258463, 525597088, 6258463, 525597088, 6258463, 525597088, 6258463, -525597088, -6258463, -525597088, -6258463, -525597088, -6258463, -525597088, -6258463, -971930207, -11573103, -971930207, -11573103, -971930207, -11573103, -971930207, -11573103, 971930207, 11573103, 971930207, 11573103, 971930207, 11573103, 971930207, 11573103, 687033653, 8180743, 687033653, 8180743, 687033653, 8180743, 687033653, 8180743, -687033653, -8180743, -687033653, -8180743, -687033653, -8180743, -687033653, -8180743, -438460075, -5220893, -438460075, -5220893, -438460075, -5220893, -438460075, -5220893, 438460075, 5220893, 438460075, 5220893, 438460075, 5220893, 438460075, 5220893, 714691721, 8510077, 714691721, 8510077, 714691721, 8510077, 714691721, 8510077, -714691721, -8510077, -714691721, -8510077, -714691721, -8510077, -714691721, -8510077, -689918177, -8215090, -689918177, -8215090, -689918177, -8215090, -689918177, -8215090, 689918177, 8215090, 689918177, 8215090, 689918177, 8215090, 689918177, 8215090 + }; + +static const uint32_t streamlined_inv_CT_table_Q1_extended[(NTT_N + (1 << 0) + (1 << 3)) << 1] = { + 0, 0, 84, 1, 84, 1, 119635792, 1424544, 84, 1, 496739340, 5914844, 119635792, 1424544, -1027317558, -12232619, 0, 0, 84, 1, 84, 1, 119635792, 1424544, 84, 1, 496739340, 5914844, 119635792, 1424544, -1027317558, -12232619, 0, 0, -283911363, -3380629, 983711428, 11713386, -242486240, -2887366, 138847220, 1653301, -375772353, -4474449, -6721989, -80041, -1001287142, -11922666, 0, 0, 496739340, 5914844, -283911363, -3380629, -42771771, -509298, 983711428, 11713386, 65601052, 781134, -242486240, -2887366, -461101573, -5490493, 0, 0, -9103545, -108399, -267484771, -3185032, -770061100, -9169379, 116066229, 1382040, -377015451, -4489251, 241571930, 2876479, -176271869, -2098929, 0, 0, 119635792, 1424544, 496739340, 5914844, -1027317558, -12232619, -283911363, -3380629, -9103545, -108399, -42771771, -509298, 253524894, 3018807, 0, 0, -42771771, -509298, 65601052, 781134, -461101573, -5490493, -406381289, -4838920, 813049292, 9681254, -122588677, -1459705, 659878385, 7857396, 0, 0, -1027317558, -12232619, -9103545, -108399, 253524894, 3018807, -267484771, -3185032, -34506365, -410879, -770061100, -9169379, 66089826, 786954, 0, 0, 253524894, 3018807, -34506365, -410879, 66089826, 786954, 615166382, 7324995, 643539471, 7662843, -663387313, -7899178, 259955382, 3095377, 0, 0 +}; + +static const uint32_t inv_twist_table_all_Q1_extended[ARRAY_N << 1] = { + -806526676, -9603587, -806526676, -9603587, -806526676, -9603587, -806526676, -9603587, 48233192, 574329, 48233192, 574329, 48233192, 574329, 48233192, 574329, -781310380, -9303328, -781310380, -9303328, -781310380, -9303328, -781310380, -9303328, -672564090, -8008449, -672564090, -8008449, -672564090, -8008449, 
-672564090, -8008449, 246168339, 2931210, 246168339, 2931210, 246168339, 2931210, 246168339, 2931210, -1029960130, -12264085, -1029960130, -12264085, -1029960130, -12264085, -1029960130, -12264085, -740184653, -8813630, -740184653, -8813630, -740184653, -8813630, -740184653, -8813630, 161300767, 1920663, 161300767, 1920663, 161300767, 1920663, 161300767, 1920663, -174979977, -2083546, -174979977, -2083546, -174979977, -2083546, -174979977, -2083546, -95582308, -1138131, -95582308, -1138131, -95582308, -1138131, -95582308, -1138131, -605914106, -7214825, -605914106, -7214825, -605914106, -7214825, -605914106, -7214825, 553452597, 6590148, 553452597, 6590148, 553452597, 6590148, 553452597, 6590148, -224497251, -2673165, -224497251, -2673165, -224497251, -2673165, -224497251, -2673165, 276485019, 3292201, 276485019, 3292201, 276485019, 3292201, 276485019, 3292201, 953978590, 11359347, 953978590, 11359347, 953978590, 11359347, 953978590, 11359347, -411604874, -4901119, -411604874, -4901119, -411604874, -4901119, -411604874, -4901119, 833204424, 9921248, 833204424, 9921248, 833204424, 9921248, 833204424, 9921248, 753488464, 8972043, 753488464, 8972043, 753488464, 8972043, 753488464, 8972043, -38469886, -458074, -38469886, -458074, -38469886, -458074, -38469886, -458074, 852175664, 10147145, 852175664, 10147145, 852175664, 10147145, 852175664, 10147145, -278415257, -3315185, -278415257, -3315185, -278415257, -3315185, -278415257, -3315185, -1014095461, -12075179, -1014095461, -12075179, -1014095461, -12075179, -1014095461, -12075179, 307793104, 3664997, 307793104, 3664997, 307793104, 3664997, 307793104, 3664997, -130967039, -1559469, -130967039, -1559469, -130967039, -1559469, -130967039, -1559469, 478387802, 5696326, 478387802, 5696326, 478387802, 5696326, 478387802, 5696326, 692860396, 8250124, 692860396, 8250124, 692860396, 8250124, 692860396, 8250124, 803792144, 9571026, 803792144, 9571026, 803792144, 9571026, 803792144, 9571026, 352456397, 4196818, 352456397, 4196818, 352456397, 4196818, 352456397, 4196818, 230047357, 2739252, 230047357, 2739252, 230047357, 2739252, 230047357, 2739252, -1026754544, -12225915, -1026754544, -12225915, -1026754544, -12225915, -1026754544, -12225915, 992128925, 11813616, 992128925, 11813616, 992128925, 11813616, 992128925, 11813616, -29941449, -356523, -29941449, -356523, -29941449, -356523, -29941449, -356523, -1068560020, -12723707, -1068560020, -12723707, -1068560020, -12723707, -1068560020, -12723707, -581973493, -6929756, -581973493, -6929756, -581973493, -6929756, -581973493, -6929756, -304246804, -3622770, -304246804, -3622770, -304246804, -3622770, -304246804, -3622770, 542646572, 6461477, 542646572, 6461477, 542646572, 6461477, 542646572, 6461477, -7172803, -85409, -7172803, -85409, -7172803, -85409, -7172803, -85409, -417737898, -4974147, -417737898, -4974147, -417737898, -4974147, -417737898, -4974147, -397539264, -4733635, -397539264, -4733635, -397539264, -4733635, -397539264, -4733635, -711017600, -8466328, -711017600, -8466328, -711017600, -8466328, -711017600, -8466328, 340918639, 4059434, 340918639, 4059434, 340918639, 4059434, 340918639, 4059434, -2971193, -35379, -2971193, -35379, -2971193, -35379, -2971193, -35379, -316030964, -3763088, -316030964, -3763088, -316030964, -3763088, -316030964, -3763088, -980706054, -11677600, -980706054, -11677600, -980706054, -11677600, -980706054, -11677600, -799784280, -9523303, -799784280, -9523303, -799784280, -9523303, -799784280, -9523303, -606599985, -7222992, -606599985, -7222992, -606599985, 
-7222992, -606599985, -7222992, 988795687, 11773926, 988795687, 11773926, 988795687, 11773926, 988795687, 11773926, -318379767, -3791056, -318379767, -3791056, -318379767, -3791056, -318379767, -3791056, 675788404, 8046842, 675788404, 8046842, 675788404, 8046842, 675788404, 8046842, 719075991, 8562282, 719075991, 8562282, 719075991, 8562282, 719075991, 8562282, -410606666, -4889233, -410606666, -4889233, -410606666, -4889233, -410606666, -4889233, -39398809, -469135, -39398809, -469135, -39398809, -469135, -39398809, -469135, -323375678, -3850544, -323375678, -3850544, -323375678, -3850544, -323375678, -3850544, -616711312, -7343391, -616711312, -7343391, -616711312, -7343391, -616711312, -7343391, 197741568, 2354576, 197741568, 2354576, 197741568, 2354576, 197741568, 2354576, 775336082, 9232190, 775336082, 9232190, 775336082, 9232190, 775336082, 9232190, -135399935, -1612253, -135399935, -1612253, -135399935, -1612253, -135399935, -1612253, 865050664, 10300452, 865050664, 10300452, 865050664, 10300452, 865050664, 10300452, -1004611982, -11962256, -1004611982, -11962256, -1004611982, -11962256, -1004611982, -11962256, -621203079, -7396876, -621203079, -7396876, -621203079, -7396876, -621203079, -7396876, 135583351, 1614437, 135583351, 1614437, 135583351, 1614437, 135583351, 1614437, 530210041, 6313391, 530210041, 6313391, 530210041, 6313391, 530210041, 6313391, -695736773, -8284374, -695736773, -8284374, -695736773, -8284374, -695736773, -8284374, 408717831, 4866742, 408717831, 4866742, 408717831, 4866742, 408717831, 4866742 + }; + +#endif diff --git a/src/kem/saber/pqclean_saber_aarch64/NTT_params.h new file mode 100644 index 0000000000..25624db6c9 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/NTT_params.h @@ -0,0 +1,32 @@ +#ifndef NTT_PARAMS_H +#define NTT_PARAMS_H + +#define ARRAY_N 256 + +#define NTT_N 64 +#define LOGNTT_N 6 + +// Q1 +#define Q1 25570817 +// omegaQ1 = 3^( (Q1 - 1) / (NTT_N << 1) ) mod Q1 +#define omegaQ1 21614269 +// invomegaQ1 = omegaQ1^{-1} mod Q1 +#define invomegaQ1 8215090 +// R = 2^32 below +// RmodQ1 = 2^32 mod^{+-} Q1 +#define RmodQ1 (-929960) +// Q1prime = Q1^{-1} mod^{+-} 2^32 +#define Q1prime (-155332095) +// invNQ1 = NTT_N^{-1} mod Q1 +#define invNQ1 25171273 +// R2modQ1 = 2^32 mod^{+-} Q1 +#define R2modQ1 (-929960) +// Q1prime2 = -Q1^{-1} mod^{+-} 2^32 +#define Q1prime2 155332095 + +#endif + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/SABER_indcpa.c new file mode 100644 index 0000000000..2ac5b2e43b --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/SABER_indcpa.c @@ -0,0 +1,196 @@ +#include "NTT.h" +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "cbd.h" +#include "fips202.h" +#include "fips202x2.h" +#include "pack_unpack.h" +#include "randombytes.h" +#include <stdint.h> +#include <string.h> + +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) + +extern void PQCLEAN_SABER_AARCH64_asm_round(uint16_t des[SABER_N], uint32_t src[SABER_N]); +extern void PQCLEAN_SABER_AARCH64_asm_enc_add_msg(uint16_t cipher[SABER_N], uint32_t src[SABER_N], uint16_t msg[SABER_N], int const_h1); +extern void PQCLEAN_SABER_AARCH64_asm_dec_get_msg(uint16_t msg[SABER_N], uint32_t src[SABER_N], uint16_t cipher[SABER_N], int const_h2); + +void indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + + uint32_t
A_NTT[SABER_L][SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N] = {0}; + + uint8_t seed_A[SABER_SEEDBYTES]; + uint8_t seed_s[SABER_NOISE_SEEDBYTES]; + + uint8_t shake_A_buf[SABER_L * SABER_L * SABER_POLYBYTES]; + uint8_t shake_s_buf[SABER_L * SABER_POLYCOINBYTES]; + + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(seed_s, SABER_NOISE_SEEDBYTES); + + shake128(shake_A_buf, sizeof(shake_A_buf), seed_A, SABER_SEEDBYTES); + shake128(shake_s_buf, sizeof(shake_s_buf), seed_s, SABER_NOISE_SEEDBYTES); + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + PQCLEAN_SABER_AARCH64_asm_13_to_32(&(A_NTT[j][i][0]), shake_A_buf + (i * SABER_L + j) * SABER_POLYBYTES); + } + } + + for (int i = 0; i < SABER_L; i++) { + cbd(s[i], shake_s_buf + i * SABER_POLYCOINBYTES); + PQCLEAN_SABER_AARCH64_asm_16_to_32(&(s_NTT[i][0]), &(s[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + NTT(&(A_NTT[i][j][0])); + } + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AARCH64_asm_asymmetric_mul(&(A_NTT[i][0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + } + + for (int i = 0; i < SABER_L; i++) { + iNTT(&(A_NTT[i][0][0])); + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AARCH64_asm_round(b[i], A_NTT[i][0]); + } + + POLVECq2BS(sk, s); + POLVECp2BS(pk, b); + memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); +} + +void indcpa_kem_enc(const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + + uint32_t A_NTT[SABER_L][SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint32_t b_NTT[SABER_L][SABER_N]; + + uint16_t sp[SABER_L][SABER_N]; + uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t vp[SABER_N] = {0}; + uint16_t mp[SABER_N]; + uint16_t b[SABER_L][SABER_N]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + + uint8_t shake_A_buf[SABER_L * SABER_L * SABER_POLYBYTES]; + uint8_t shake_s_buf[SABER_L * SABER_POLYCOINBYTES]; + + shake128(shake_A_buf, sizeof(shake_A_buf), seed_A, SABER_SEEDBYTES); + shake128(shake_s_buf, sizeof(shake_s_buf), seed_sp, SABER_NOISE_SEEDBYTES); + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + PQCLEAN_SABER_AARCH64_asm_13_to_32(&(A_NTT[i][j][0]), shake_A_buf + (i * SABER_L + j) * SABER_POLYBYTES); + } + } + + for (int i = 0; i < SABER_L; i++) { + cbd(sp[i], shake_s_buf + i * SABER_POLYCOINBYTES); + PQCLEAN_SABER_AARCH64_asm_16_to_32(&(s_NTT[i][0]), &(sp[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + for (int j = 0; j < SABER_L; j++) { + NTT(&(A_NTT[i][j][0])); + } + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AARCH64_asm_asymmetric_mul(&(A_NTT[i][0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + } + + for (int i = 0; i < SABER_L; i++) { + iNTT(&(A_NTT[i][0][0])); + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AARCH64_asm_round(bp[i], A_NTT[i][0]); + } + + + BS2POLVECp(pk, b); + BS2POLmsg(m, mp); + + for (int i = 0; i < 
SABER_L; i++) { + PQCLEAN_SABER_AARCH64_asm_16_to_32(&(b_NTT[i][0]), &(b[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT(&(b_NTT[i][0])); + } + + PQCLEAN_SABER_AARCH64_asm_asymmetric_mul(&(b_NTT[0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + + iNTT(&(b_NTT[0][0])); + + PQCLEAN_SABER_AARCH64_asm_enc_add_msg(vp, b_NTT[0], mp, h1); + + POLVECp2BS(ciphertext, bp); + POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); + + +} + +void indcpa_kem_dec(const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC], uint8_t m[SABER_KEYBYTES]) { + + uint32_t b_NTT[SABER_L][SABER_N]; + uint32_t s_NTT[SABER_L][SABER_N]; + uint32_t s_NTT_asymmetric[SABER_L][SABER_N]; + + uint16_t v[SABER_N] = {0}; + uint16_t cm[SABER_N]; + + BS2POLT(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, cm); + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AARCH64_asm_13_to_32(&(s_NTT[i][0]), sk + i * SABER_POLYBYTES); + } + + for (int i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AARCH64_asm_10_to_32(&(b_NTT[i][0]), ciphertext + i * (SABER_EP * SABER_N / 8)); + } + + for (int i = 0; i < SABER_L; i++) { + NTT_heavy(&(s_NTT_asymmetric[i][0]), &(s_NTT[i][0])); + } + + for (int i = 0; i < SABER_L; i++) { + NTT(&(b_NTT[i][0])); + } + + PQCLEAN_SABER_AARCH64_asm_asymmetric_mul(&(b_NTT[0][0]), &(s_NTT[0][0]), &(s_NTT_asymmetric[0][0]), constants); + + iNTT(&(b_NTT[0][0])); + + PQCLEAN_SABER_AARCH64_asm_dec_get_msg(v, b_NTT[0], cm, h2); + + POLmsg2BS(m, v); +} + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/SABER_indcpa.h new file mode 100644 index 0000000000..0b74c2fca0 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/SABER_indcpa.h @@ -0,0 +1,14 @@ +#ifndef INDCPA_H +#define INDCPA_H + +#include "SABER_params.h" +#include <stdint.h> + +#define indcpa_kem_keypair SABER_NAMESPACE(indcpa_kem_keypair) +void indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); +#define indcpa_kem_enc SABER_NAMESPACE(indcpa_kem_enc) +void indcpa_kem_enc(const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t ciphertext[SABER_BYTES_CCA_DEC]); +#define indcpa_kem_dec SABER_NAMESPACE(indcpa_kem_dec) +void indcpa_kem_dec(const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC], uint8_t m[SABER_KEYBYTES]); + +#endif diff --git a/src/kem/saber/pqclean_saber_aarch64/SABER_params.h new file mode 100644 index 0000000000..a6c159f144 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/SABER_params.h @@ -0,0 +1,48 @@ +#ifndef PARAMS_H +#define PARAMS_H +/*============================================================================= +This file has been adapted from the implementation +(available, under Public Domain, at https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, Frederik Vercauteren, +Jose Maria Bermudo Mera, Michiel Van Beirendonck, and Andrea Basso.
+=============================================================================*/ + +#define SABER_NAMESPACE(s) PQCLEAN_SABER_AARCH64_##s +#define SABER_L 3 + +/* Don't change anything below this line */ +#define SABER_MU 8 +#define SABER_ET 4 + +#define SABER_EQ 13 +#define SABER_EP 10 +#define SABER_N 256 + +#define SABER_Q 8192 //2^13 +#define SABER_P 1024 + +#define SABER_SEEDBYTES 32 +#define SABER_NOISE_SEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 + +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) + +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) + +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) + +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) + +#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) +#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) + +#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) + +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) + +#endif diff --git a/src/kem/saber/pqclean_saber_aarch64/__asm_NTT.S b/src/kem/saber/pqclean_saber_aarch64/__asm_NTT.S new file mode 100644 index 0000000000..2a00ba0996 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/__asm_NTT.S @@ -0,0 +1,309 @@ + +#include "macros.inc" + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_top +.global _PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_top +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_top, %function +#endif +PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_top: +_PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_top: + + push_all + Q .req w20 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + src4 .req x4 + src5 .req x5 + src6 .req x6 + src7 .req x7 + src8 .req x8 + src9 .req x9 + src10 .req x10 + src11 .req x11 + src12 .req x12 + src13 .req x13 + src14 .req x14 + src15 .req x15 + table .req x28 + counter .req x19 + + ldr Q, [x2] + + mov table, x1 + + add src1, src0, #64 + add src2, src0, #128 + + add src3, src0, #192 + add src4, src0, #256 + + add src5, src0, #320 + add src6, src0, #384 + + add src7, src0, #448 + add src8, src0, #512 + + add src9, src0, #576 + add src10, src0, #640 + + add src11, src0, #704 + add src12, src0, #768 + + add src13, src0, #832 + add src14, src0, #896 + + add src15, src0, #960 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 + + mov v20.S[0], Q + + ld1 { v0.4S}, [ src0] + ld1 { v2.4S}, [ src2] + ld1 { v4.4S}, [ src4] + ld1 { v6.4S}, [ src6] + ld1 { v8.4S}, [ src8] + ld1 {v10.4S}, [src10] + ld1 {v12.4S}, [src12] + ld1 {v14.4S}, [src14] + + ld1 { v1.4S}, [ src1] + ld1 { v3.4S}, [ src3] + ld1 { v5.4S}, [ src5] + ld1 { v7.4S}, [ src7] + ld1 { v9.4S}, [ src9] + ld1 {v11.4S}, [src11] + ld1 {v13.4S}, [src13] + ld1 {v15.4S}, [src15] + + qq_butterfly_top v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, 
v21, 2, 3 + qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_bot v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + + mov counter, #3 + _ntt_top_loop: + + st1 { v0.4S}, [ src0], #16 + ld1 { v0.4S}, [ src0] + st1 { v2.4S}, [ src2], #16 + ld1 { v2.4S}, [ src2] + st1 { v4.4S}, [ src4], #16 + ld1 { v4.4S}, [ src4] + st1 { v6.4S}, [ src6], #16 + ld1 { v6.4S}, [ src6] + st1 { v8.4S}, [ src8], #16 + ld1 { v8.4S}, [ src8] + st1 {v10.4S}, [src10], #16 + ld1 {v10.4S}, [src10] + st1 {v12.4S}, [src12], #16 + ld1 {v12.4S}, [src12] + st1 {v14.4S}, [src14], #16 + ld1 {v14.4S}, [src14] + + st1 { v1.4S}, [ src1], #16 + ld1 { v1.4S}, [ src1] + st1 { v3.4S}, [ src3], #16 + ld1 { v3.4S}, [ src3] + st1 { v5.4S}, [ src5], #16 + ld1 { v5.4S}, [ src5] + st1 { v7.4S}, [ src7], #16 + ld1 { v7.4S}, [ src7] + st1 { v9.4S}, [ src9], #16 + ld1 { v9.4S}, [ src9] + st1 {v11.4S}, [src11], #16 + ld1 {v11.4S}, [src11] + st1 {v13.4S}, [src13], #16 + ld1 {v13.4S}, [src13] + st1 {v15.4S}, [src15], #16 + ld1 {v15.4S}, [src15] + + qq_butterfly_top v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v8, v10, v12, v14, v16, v17, v18, v19, v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v1, v3, v5, v7, v9, v11, v13, v15, v28, v29, v30, v31, v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v2, v8, v10, v4, v6, v12, v14, v16, v17, v18, v19, v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v1, v3, v9, v11, v5, v7, v13, v15, v28, v29, v30, v31, v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v4, v8, v12, v2, v6, v10, v14, v16, v17, v18, v19, v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_bot v1, v5, v9, v13, v3, v7, v11, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + + sub counter, counter, #1 + cbnz counter, _ntt_top_loop + + st1 { v0.4S}, [ src0], #16 + st1 { v2.4S}, [ src2], #16 + st1 { v4.4S}, [ src4], #16 + st1 { v6.4S}, [ src6], #16 + st1 { v8.4S}, [ src8], #16 + st1 {v10.4S}, [src10], #16 + st1 {v12.4S}, [src12], #16 + st1 {v14.4S}, [src14], #16 + + st1 { v1.4S}, [ src1], #16 + st1 { v3.4S}, [ src3], #16 + st1 { v5.4S}, [ src5], #16 + st1 { v7.4S}, [ src7], #16 + st1 { v9.4S}, [ src9], #16 + st1 {v11.4S}, [src11], #16 + st1 {v13.4S}, [src13], #16 + st1 
{v15.4S}, [src15], #16 + + .unreq Q + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq src4 + .unreq src5 + .unreq src6 + .unreq src7 + .unreq src8 + .unreq src9 + .unreq src10 + .unreq src11 + .unreq src12 + .unreq src13 + .unreq src14 + .unreq src15 + .unreq table + .unreq counter + pop_all + + br lr + + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_bot +.global _PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_bot +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_bot, %function +#endif +PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_bot: +_PQCLEAN_SABER_AARCH64_asm_ntt_SIMD_bot: + + push_all + Q .req w20 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + table0 .req x27 + table1 .req x28 + counter .req x19 + + ldr Q, [x2] + + add table0, x1, #64 + add table1, x1, #320 + + add src1, src0, #0 + add src2, src0, #512 + add src3, src0, #512 + + ld1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src0], #64 + ld1 { v8.4S, v9.4S, v10.4S, v11.4S}, [src2], #64 + ld1 { v4.4S, v5.4S, v6.4S, v7.4S}, [src0], #64 + ld1 {v12.4S, v13.4S, v14.4S, v15.4S}, [src2], #64 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table0], #64 + ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + + mov v20.S[0], Q + + qq_butterfly_top v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3 + qq_butterfly_mixed v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v20, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3 + qq_butterfly_mixed v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + mov counter, #3 + _ntt_bot_loop: + + st1 { v0.4S}, [src1], #16 + ld1 { v0.4S}, [src0], #16 + st1 { v1.4S}, [src1], #16 + ld1 { v1.4S}, [src0], #16 + st1 { v2.4S}, [src1], #16 + ld1 { v2.4S}, [src0], #16 + st1 { v3.4S}, [src1], #16 + ld1 { v3.4S}, [src0], #16 + st1 { v4.4S}, [src1], #16 + ld1 { v4.4S}, [src0], #16 + st1 { v5.4S}, [src1], #16 + ld1 { v5.4S}, [src0], #16 + st1 { v6.4S}, [src1], #16 + ld1 { v6.4S}, [src0], #16 + st1 { v7.4S}, [src1], #16 + ld1 { v7.4S}, [src0], #16 + st1 { v8.4S}, [src3], #16 + ld1 { v8.4S}, [src2], #16 + st1 { v9.4S}, [src3], #16 + ld1 { v9.4S}, [src2], #16 + st1 {v10.4S}, [src3], #16 + ld1 {v10.4S}, [src2], #16 + st1 {v11.4S}, [src3], #16 + ld1 {v11.4S}, [src2], #16 + st1 {v12.4S}, [src3], #16 + ld1 {v12.4S}, [src2], #16 + st1 {v13.4S}, [src3], #16 + ld1 {v13.4S}, [src2], #16 + st1 {v14.4S}, [src3], #16 + ld1 {v14.4S}, [src2], #16 + st1 {v15.4S}, [src3], #16 + ld1 {v15.4S}, [src2], #16 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table0], #64 
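+ // refresh this iteration's butterfly constants: v20-v23 were just reloaded from table0, and v24-v27 come from table1 below (each table pointer advances 64 bytes per pass)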
+ ld1 {v24.4S, v25.4S, v26.4S, v27.4S}, [table1], #64 + + mov v20.S[0], Q + + qq_butterfly_top v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + qq_butterfly_mixed v0, v1, v2, v3, v4, v5, v6, v7, v16, v17, v18, v19, v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v20, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3 + qq_butterfly_mixed v8, v9, v10, v11, v12, v13, v14, v15, v28, v29, v30, v31, v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v20, v24, 2, 3, v24, 2, 3, v24, 2, 3, v24, 2, 3, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3 + qq_butterfly_mixed v0, v1, v4, v5, v2, v3, v6, v7, v16, v17, v18, v19, v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v20, v21, 0, 1, v21, 0, 1, v21, 2, 3, v21, 2, 3, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3 + qq_butterfly_mixed v8, v9, v12, v13, v10, v11, v14, v15, v28, v29, v30, v31, v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v20, v25, 0, 1, v25, 0, 1, v25, 2, 3, v25, 2, 3, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + qq_butterfly_mixed v0, v2, v4, v6, v1, v3, v5, v7, v16, v17, v18, v19, v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + qq_butterfly_bot v8, v10, v12, v14, v9, v11, v13, v15, v28, v29, v30, v31, v20, v26, 0, 1, v26, 2, 3, v27, 0, 1, v27, 2, 3 + + sub counter, counter, #1 + cbnz counter, _ntt_bot_loop + + st1 { v0.4S, v1.4S, v2.4S, v3.4S}, [src1], #64 + st1 { v8.4S, v9.4S, v10.4S, v11.4S}, [src3], #64 + st1 { v4.4S, v5.4S, v6.4S, v7.4S}, [src1], #64 + st1 {v12.4S, v13.4S, v14.4S, v15.4S}, [src3], #64 + + .unreq Q + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq table0 + .unreq table1 + pop_all + + br lr + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/__asm_iNTT.S b/src/kem/saber/pqclean_saber_aarch64/__asm_iNTT.S new file mode 100644 index 0000000000..e25fd8ef57 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/__asm_iNTT.S @@ -0,0 +1,472 @@ + +#include "macros.inc" + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_intt_SIMD_top +.global _PQCLEAN_SABER_AARCH64_asm_intt_SIMD_top +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_intt_SIMD_top, %function +#endif +PQCLEAN_SABER_AARCH64_asm_intt_SIMD_top: +_PQCLEAN_SABER_AARCH64_asm_intt_SIMD_top: + + push_all + Q .req w20 + src0 .req x0 + des0 .req x1 + src1 .req x2 + des1 .req x3 + table .req x28 + counter .req x19 + + ldr Q, [x2] + + mov table, x1 + + add des0, src0, #0 + add src1, src0, #512 + add des1, src0, #512 + + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [table], #64 + + mov v20.S[0], Q + + ld1 { v0.4S}, [src0], #16 + ld1 { v1.4S}, [src0], #16 + ld1 { v2.4S}, [src0], #16 + ld1 { v3.4S}, [src0], #16 + ld1 { v4.4S}, [src0], #16 + ld1 { v5.4S}, [src0], #16 + ld1 { v6.4S}, [src0], #16 + ld1 { v7.4S}, [src0], #16 + + ld1 { v8.4S}, [src1], #16 + ld1 { v9.4S}, [src1], #16 + ld1 {v10.4S}, [src1], #16 + ld1 {v11.4S}, [src1], #16 + ld1 {v12.4S}, [src1], #16 + ld1 {v13.4S}, [src1], #16 + ld1 {v14.4S}, [src1], #16 + ld1 {v15.4S}, [src1], #16 + + qq_add_sub v16, v17, v18, v19, v1, v3, v5, v7, v0, v2, v4, v6, v1, v3, v5, v7 + qq_add_sub v28, v29, v30, v31, v9, v11, v13, v15, v8, v10, v12, v14, v9, v11, v13, v15 + + qq_add_sub v0, v4, v8, v12, v2, v6, v10, v14, v16, v18, v28, v30, v17, v19, v29, v31 + + dq_butterfly_top v1, v5, v3, v7, v16, v17, v20, v21, 2, 3, v21, 2, 3 + + dq_butterfly_mixed v1, v5, v3, v7, v16, v17, v9, v13, v11, 
v15, v18, v19, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + dq_butterfly_mixed v9, v13, v11, v15, v18, v19, v0, v1, v4, v5, v28, v29, v20, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v0, v1, v4, v5, v28, v29, v2, v3, v6, v7, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v30, v31, v8, v9, v12, v13, v16, v17, v20, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v8, v9, v12, v13, v16, v17, v10, v11, v14, v15, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_bot v10, v11, v14, v15, v18, v19, v20, v23, 0, 1, v23, 2, 3 + + mov counter, #3 + _intt_top_loop: + + st1 { v0.4S}, [des0], #16 + ld1 { v0.4S}, [src0], #16 + st1 { v1.4S}, [des0], #16 + ld1 { v1.4S}, [src0], #16 + st1 { v2.4S}, [des0], #16 + ld1 { v2.4S}, [src0], #16 + st1 { v3.4S}, [des0], #16 + ld1 { v3.4S}, [src0], #16 + st1 { v4.4S}, [des0], #16 + ld1 { v4.4S}, [src0], #16 + st1 { v5.4S}, [des0], #16 + ld1 { v5.4S}, [src0], #16 + st1 { v6.4S}, [des0], #16 + ld1 { v6.4S}, [src0], #16 + st1 { v7.4S}, [des0], #16 + ld1 { v7.4S}, [src0], #16 + + st1 { v8.4S}, [des1], #16 + ld1 { v8.4S}, [src1], #16 + st1 { v9.4S}, [des1], #16 + ld1 { v9.4S}, [src1], #16 + st1 {v10.4S}, [des1], #16 + ld1 {v10.4S}, [src1], #16 + st1 {v11.4S}, [des1], #16 + ld1 {v11.4S}, [src1], #16 + st1 {v12.4S}, [des1], #16 + ld1 {v12.4S}, [src1], #16 + st1 {v13.4S}, [des1], #16 + ld1 {v13.4S}, [src1], #16 + st1 {v14.4S}, [des1], #16 + ld1 {v14.4S}, [src1], #16 + st1 {v15.4S}, [des1], #16 + ld1 {v15.4S}, [src1], #16 + + qq_add_sub v16, v17, v18, v19, v1, v3, v5, v7, v0, v2, v4, v6, v1, v3, v5, v7 + qq_add_sub v28, v29, v30, v31, v9, v11, v13, v15, v8, v10, v12, v14, v9, v11, v13, v15 + + qq_add_sub v0, v4, v8, v12, v2, v6, v10, v14, v16, v18, v28, v30, v17, v19, v29, v31 + + dq_butterfly_top v1, v5, v3, v7, v16, v17, v20, v21, 2, 3, v21, 2, 3 + + dq_butterfly_mixed v1, v5, v3, v7, v16, v17, v9, v13, v11, v15, v18, v19, v20, v21, 2, 3, v21, 2, 3, v21, 2, 3, v21, 2, 3 + dq_butterfly_mixed v9, v13, v11, v15, v18, v19, v0, v1, v4, v5, v28, v29, v20, v21, 2, 3, v21, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v0, v1, v4, v5, v28, v29, v2, v3, v6, v7, v30, v31, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v30, v31, v8, v9, v12, v13, v16, v17, v20, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_mixed v8, v9, v12, v13, v16, v17, v10, v11, v14, v15, v18, v19, v20, v22, 0, 1, v22, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_bot v10, v11, v14, v15, v18, v19, v20, v23, 0, 1, v23, 2, 3 + + sub counter, counter, #1 + cbnz counter, _intt_top_loop + + st1 { v0.4S}, [des0], #16 + st1 { v1.4S}, [des0], #16 + st1 { v2.4S}, [des0], #16 + st1 { v3.4S}, [des0], #16 + st1 { v4.4S}, [des0], #16 + st1 { v5.4S}, [des0], #16 + st1 { v6.4S}, [des0], #16 + st1 { v7.4S}, [des0], #16 + + st1 { v8.4S}, [des1], #16 + st1 { v9.4S}, [des1], #16 + st1 {v10.4S}, [des1], #16 + st1 {v11.4S}, [des1], #16 + st1 {v12.4S}, [des1], #16 + st1 {v13.4S}, [des1], #16 + st1 {v14.4S}, [des1], #16 + st1 {v15.4S}, [des1], #16 + + .unreq Q + .unreq src0 + .unreq des0 + .unreq src1 + .unreq des1 + .unreq table + .unreq counter + pop_all + + br lr + + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_intt_SIMD_bot +.global _PQCLEAN_SABER_AARCH64_asm_intt_SIMD_bot +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_intt_SIMD_bot, %function +#endif +PQCLEAN_SABER_AARCH64_asm_intt_SIMD_bot: +_PQCLEAN_SABER_AARCH64_asm_intt_SIMD_bot: + + push_all + Q .req w20 + Qhalf .req 
w21 + nQhalf .req w22 + src0 .req x0 + src1 .req x1 + src2 .req x2 + src3 .req x3 + src4 .req x4 + src5 .req x5 + src6 .req x6 + src7 .req x7 + table .req x28 + twistT0 .req x8 + twistT1 .req x9 + twistT2 .req x10 + twistT3 .req x11 + twistT4 .req x12 + twistT5 .req x13 + twistT6 .req x14 + twistT7 .req x15 + counter .req x19 + + add twistT0, x3, #256*0 + add twistT1, x3, #256*1 + add twistT2, x3, #256*2 + add twistT3, x3, #256*3 + add twistT4, x3, #256*4 + add twistT5, x3, #256*5 + add twistT6, x3, #256*6 + add twistT7, x3, #256*7 + + ldr Q, [x2] + lsr Qhalf, Q, #1 + neg nQhalf, Qhalf + + add table, x1, #64 + + add src1, src0, #128 + add src2, src0, #256 + add src3, src0, #384 + add src4, src0, #512 + add src5, src0, #640 + add src6, src0, #768 + add src7, src0, #896 + + ld1 { v0.4S}, [ src0] + ld1 { v1.4S}, [ src1] + ld1 { v2.4S}, [ src2] + ld1 { v3.4S}, [ src3] + ld1 { v4.4S}, [ src4] + ld1 { v5.4S}, [ src5] + ld1 { v6.4S}, [ src6] + ld1 { v7.4S}, [ src7] + + ld1 {v20.4S}, [table], #16 + ld1 {v21.4S}, [table], #16 + ld1 {v22.4S}, [table], #16 + ld1 {v23.4S}, [table], #16 + + dup v24.4S, Q + dup v25.4S, Qhalf + dup v26.4S, nQhalf + + dq_butterfly_top v4, v6, v5, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v4, v6, v5, v7, v18, v19, v0, v2, v1, v3, v16, v17, v24, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v0, v2, v1, v3, v16, v17, v4, v5, v6, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v4, v5, v6, v7, v18, v19, v0, v1, v2, v3, v16, v17, v24, v21, 0, 1, v21, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v0, v1, v2, v3, v16, v17, v2, v3, v6, v7, v18, v19, v24, v21, 0, 1, v21, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v18, v19, v0, v1, v4, v5, v16, v17, v24, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_bot v0, v1, v4, v5, v16, v17, v24, v22, 0, 1, v22, 2, 3 + + ld2 { v8.4S, v9.4S}, [twistT0], #32 + ld2 {v10.4S, v11.4S}, [twistT1], #32 + ld2 {v12.4S, v13.4S}, [twistT2], #32 + ld2 {v14.4S, v15.4S}, [twistT3], #32 + + sqrdmulh v16.4S, v0.4S, v8.4S + sqrdmulh v17.4S, v1.4S, v10.4S + sqrdmulh v18.4S, v2.4S, v12.4S + sqrdmulh v19.4S, v3.4S, v14.4S + + mul v0.4S, v0.4S, v9.4S + mul v1.4S, v1.4S, v11.4S + mul v2.4S, v2.4S, v13.4S + mul v3.4S, v3.4S, v15.4S + + mls v0.4S, v16.4S, v24.4S + ld2 { v8.4S, v9.4S}, [twistT4], #32 + mls v1.4S, v17.4S, v24.4S + ld2 {v10.4S, v11.4S}, [twistT5], #32 + mls v2.4S, v18.4S, v24.4S + ld2 {v12.4S, v13.4S}, [twistT6], #32 + mls v3.4S, v19.4S, v24.4S + ld2 {v14.4S, v15.4S}, [twistT7], #32 + + cmge v18.4S, v26.4S, v0.4S + sqrdmulh v20.4S, v4.4S, v8.4S + cmge v19.4S, v26.4S, v1.4S + sqrdmulh v21.4S, v5.4S, v10.4S + cmgt v16.4S, v0.4S, v25.4S + sqrdmulh v22.4S, v6.4S, v12.4S + cmgt v17.4S, v1.4S, v25.4S + sqrdmulh v23.4S, v7.4S, v14.4S + + sub v16.4S, v16.4S, v18.4S + mul v4.4S, v4.4S, v9.4S + sub v17.4S, v17.4S, v19.4S + mul v5.4S, v5.4S, v11.4S + + mla v0.4S, v16.4S, v24.4S + mul v6.4S, v6.4S, v13.4S + mla v1.4S, v17.4S, v24.4S + mul v7.4S, v7.4S, v15.4S + + cmge v18.4S, v26.4S, v2.4S + mls v4.4S, v20.4S, v24.4S + cmge v19.4S, v26.4S, v3.4S + mls v5.4S, v21.4S, v24.4S + cmgt v16.4S, v2.4S, v25.4S + mls v6.4S, v22.4S, v24.4S + cmgt v17.4S, v3.4S, v25.4S + mls v7.4S, v23.4S, v24.4S + + sub v16.4S, v16.4S, v18.4S + cmge v22.4S, v26.4S, v4.4S + sub v17.4S, v17.4S, v19.4S + cmge v23.4S, v26.4S, v5.4S + + mla v2.4S, v16.4S, v24.4S + cmgt v20.4S, v4.4S, v25.4S + mla v3.4S, v17.4S, v24.4S + cmgt v21.4S, v5.4S, v25.4S + + st1 { v0.4S}, [ src0], #16 + sub v20.4S, 
v20.4S, v22.4S + st1 { v1.4S}, [ src1], #16 + sub v21.4S, v21.4S, v23.4S + st1 { v2.4S}, [ src2], #16 + mla v4.4S, v20.4S, v24.4S + st1 { v3.4S}, [ src3], #16 + mla v5.4S, v21.4S, v24.4S + + mov counter, #7 + _intt_bot_loop: + + cmge v22.4S, v26.4S, v6.4S + ld1 { v0.4S}, [ src0] + cmge v23.4S, v26.4S, v7.4S + ld1 { v1.4S}, [ src1] + cmgt v20.4S, v6.4S, v25.4S + ld1 { v2.4S}, [ src2] + cmgt v21.4S, v7.4S, v25.4S + ld1 { v3.4S}, [ src3] + + sub v20.4S, v20.4S, v22.4S + sub v21.4S, v21.4S, v23.4S + + mla v6.4S, v20.4S, v24.4S + mla v7.4S, v21.4S, v24.4S + + st1 { v4.4S}, [ src4], #16 + ld1 { v4.4S}, [ src4] + st1 { v5.4S}, [ src5], #16 + ld1 { v5.4S}, [ src5] + st1 { v6.4S}, [ src6], #16 + ld1 { v6.4S}, [ src6] + st1 { v7.4S}, [ src7], #16 + ld1 { v7.4S}, [ src7] + + ld1 {v20.4S}, [table], #16 + ld1 {v21.4S}, [table], #16 + ld1 {v22.4S}, [table], #16 + ld1 {v23.4S}, [table], #16 + + dup v24.4S, Q + dup v25.4S, Qhalf + dup v26.4S, nQhalf + + dq_butterfly_top v4, v6, v5, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v4, v6, v5, v7, v18, v19, v0, v2, v1, v3, v16, v17, v24, v20, 2, 3, v20, 2, 3, v20, 2, 3, v20, 2, 3 + dq_butterfly_mixed v0, v2, v1, v3, v16, v17, v4, v5, v6, v7, v18, v19, v24, v20, 2, 3, v20, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v4, v5, v6, v7, v18, v19, v0, v1, v2, v3, v16, v17, v24, v21, 0, 1, v21, 2, 3, v21, 0, 1, v21, 2, 3 + dq_butterfly_mixed v0, v1, v2, v3, v16, v17, v2, v3, v6, v7, v18, v19, v24, v21, 0, 1, v21, 2, 3, v23, 0, 1, v23, 2, 3 + dq_butterfly_mixed v2, v3, v6, v7, v18, v19, v0, v1, v4, v5, v16, v17, v24, v23, 0, 1, v23, 2, 3, v22, 0, 1, v22, 2, 3 + dq_butterfly_bot v0, v1, v4, v5, v16, v17, v24, v22, 0, 1, v22, 2, 3 + + ld2 { v8.4S, v9.4S}, [twistT0], #32 + ld2 {v10.4S, v11.4S}, [twistT1], #32 + ld2 {v12.4S, v13.4S}, [twistT2], #32 + ld2 {v14.4S, v15.4S}, [twistT3], #32 + + sqrdmulh v16.4S, v0.4S, v8.4S + sqrdmulh v17.4S, v1.4S, v10.4S + sqrdmulh v18.4S, v2.4S, v12.4S + sqrdmulh v19.4S, v3.4S, v14.4S + + mul v0.4S, v0.4S, v9.4S + mul v1.4S, v1.4S, v11.4S + mul v2.4S, v2.4S, v13.4S + mul v3.4S, v3.4S, v15.4S + + mls v0.4S, v16.4S, v24.4S + ld2 { v8.4S, v9.4S}, [twistT4], #32 + mls v1.4S, v17.4S, v24.4S + ld2 {v10.4S, v11.4S}, [twistT5], #32 + mls v2.4S, v18.4S, v24.4S + ld2 {v12.4S, v13.4S}, [twistT6], #32 + mls v3.4S, v19.4S, v24.4S + ld2 {v14.4S, v15.4S}, [twistT7], #32 + + cmge v18.4S, v26.4S, v0.4S + sqrdmulh v20.4S, v4.4S, v8.4S + cmge v19.4S, v26.4S, v1.4S + sqrdmulh v21.4S, v5.4S, v10.4S + cmgt v16.4S, v0.4S, v25.4S + sqrdmulh v22.4S, v6.4S, v12.4S + cmgt v17.4S, v1.4S, v25.4S + sqrdmulh v23.4S, v7.4S, v14.4S + + sub v16.4S, v16.4S, v18.4S + mul v4.4S, v4.4S, v9.4S + sub v17.4S, v17.4S, v19.4S + mul v5.4S, v5.4S, v11.4S + + mla v0.4S, v16.4S, v24.4S + mul v6.4S, v6.4S, v13.4S + mla v1.4S, v17.4S, v24.4S + mul v7.4S, v7.4S, v15.4S + + cmge v18.4S, v26.4S, v2.4S + mls v4.4S, v20.4S, v24.4S + cmge v19.4S, v26.4S, v3.4S + mls v5.4S, v21.4S, v24.4S + cmgt v16.4S, v2.4S, v25.4S + mls v6.4S, v22.4S, v24.4S + cmgt v17.4S, v3.4S, v25.4S + mls v7.4S, v23.4S, v24.4S + + sub v16.4S, v16.4S, v18.4S + cmge v22.4S, v26.4S, v4.4S + sub v17.4S, v17.4S, v19.4S + cmge v23.4S, v26.4S, v5.4S + + mla v2.4S, v16.4S, v24.4S + cmgt v20.4S, v4.4S, v25.4S + mla v3.4S, v17.4S, v24.4S + cmgt v21.4S, v5.4S, v25.4S + + st1 { v0.4S}, [ src0], #16 + sub v20.4S, v20.4S, v22.4S + st1 { v1.4S}, [ src1], #16 + sub v21.4S, v21.4S, v23.4S + st1 { v2.4S}, [ src2], #16 + mla v4.4S, v20.4S, v24.4S + st1 { v3.4S}, [ src3], #16 + mla v5.4S, v21.4S, v24.4S + + sub counter, 
counter, #1 + cbnz counter, _intt_bot_loop + + cmge v22.4S, v26.4S, v6.4S + cmge v23.4S, v26.4S, v7.4S + cmgt v20.4S, v6.4S, v25.4S + cmgt v21.4S, v7.4S, v25.4S + + sub v20.4S, v20.4S, v22.4S + sub v21.4S, v21.4S, v23.4S + + mla v6.4S, v20.4S, v24.4S + mla v7.4S, v21.4S, v24.4S + + st1 { v4.4S}, [ src4], #16 + st1 { v5.4S}, [ src5], #16 + st1 { v6.4S}, [ src6], #16 + st1 { v7.4S}, [ src7], #16 + + .unreq Q + .unreq Qhalf + .unreq nQhalf + .unreq src0 + .unreq src1 + .unreq src2 + .unreq src3 + .unreq src4 + .unreq src5 + .unreq src6 + .unreq src7 + .unreq table + .unreq twistT0 + .unreq twistT1 + .unreq twistT2 + .unreq twistT3 + .unreq twistT4 + .unreq twistT5 + .unreq twistT6 + .unreq twistT7 + .unreq counter + pop_all + + br lr + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/__asm_mul.S b/src/kem/saber/pqclean_saber_aarch64/__asm_mul.S new file mode 100644 index 0000000000..5b0a471521 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/__asm_mul.S @@ -0,0 +1,255 @@ + +#include "macros.inc" +#include "SABER_params.h" + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_asymmetric_mul +.global _PQCLEAN_SABER_AARCH64_asm_asymmetric_mul +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_asymmetric_mul, %function +#endif +PQCLEAN_SABER_AARCH64_asm_asymmetric_mul: +_PQCLEAN_SABER_AARCH64_asm_asymmetric_mul: + + push_all + + ldr w28, [x3, #0] + ldr w27, [x3, #4] + + dup v28.4S, w28 + dup v29.4S, w27 + + add x11, x0, #0 + + add x4, x0, #1024 + add x5, x1, #1024 + add x6, x2, #1024 + +.if SABER_L > 2 + add x8, x0, #2048 + add x9, x1, #2048 + add x10, x2, #2048 +.endif + +.if SABER_L > 3 + add x12, x0, #3072 + add x13, x1, #3072 + add x14, x2, #3072 +.endif + + mov x16, #16 + _asymmetric_loop: + + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x0], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x1], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [ x2], #64 + + _4x4_asymmetric smull, smull2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 + + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x4], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x5], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [ x6], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 + +.if SABER_L > 2 + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [ x8], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [ x9], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [x10], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 +.endif + +.if SABER_L > 3 + ld4 { v0.4S, v1.4S, v2.4S, v3.4S}, [x12], #64 + ld4 { v4.4S, v5.4S, v6.4S, v7.4S}, [x13], #64 + ld4 { v8.4S, v9.4S, v10.4S, v11.4S}, [x14], #64 + + _4x4_asymmetric smlal, smlal2, v3, v9, v10, v11, 
v4, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v2, v10, v11, v4, v5, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v1, v11, v4, v5, v6, v16, v20, v17, v21, v18, v22, v19, v23 + _4x4_asymmetric smlal, smlal2, v0, v4, v5, v6, v7, v16, v20, v17, v21, v18, v22, v19, v23 +.endif + + qq_montgomery v24, v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v0, v1, v2, v3, v29, v28 + + st4 {v24.4S, v25.4S, v26.4S, v27.4S}, [x11], #64 + + sub x16, x16, #1 + cbnz x16, _asymmetric_loop + + pop_all + + br lr + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_point_mul_extended +.global _PQCLEAN_SABER_AARCH64_asm_point_mul_extended +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_point_mul_extended, %function +#endif +PQCLEAN_SABER_AARCH64_asm_point_mul_extended: +_PQCLEAN_SABER_AARCH64_asm_point_mul_extended: + + push_all + + ldr w20, [x3] + + ld1 { v0.4S}, [x1], #16 + ld1 { v1.4S}, [x1], #16 + ld1 { v2.4S}, [x1], #16 + ld1 { v3.4S}, [x1], #16 + + ld2 { v4.4S, v5.4S}, [x2], #32 + ld2 { v6.4S, v7.4S}, [x2], #32 + ld2 { v8.4S, v9.4S}, [x2], #32 + ld2 {v10.4S, v11.4S}, [x2], #32 + + sqrdmulh v12.4S, v0.4S, v4.4S + sqrdmulh v13.4S, v1.4S, v6.4S + sqrdmulh v14.4S, v2.4S, v8.4S + sqrdmulh v15.4S, v3.4S, v10.4S + + mov x16, #7 + _point_mul_loop: + + dup v4.4S, w20 + + mul v0.4S, v0.4S, v5.4S + ld1 {v16.4S}, [x1], #16 + mul v1.4S, v1.4S, v7.4S + ld1 {v17.4S}, [x1], #16 + mul v2.4S, v2.4S, v9.4S + ld1 {v18.4S}, [x1], #16 + mul v3.4S, v3.4S, v11.4S + ld1 {v19.4S}, [x1], #16 + + mls v0.4S, v12.4S, v4.4S + ld2 {v20.4S, v21.4S}, [x2], #32 + mls v1.4S, v13.4S, v4.4S + ld2 {v22.4S, v23.4S}, [x2], #32 + mls v2.4S, v14.4S, v4.4S + ld2 {v24.4S, v25.4S}, [x2], #32 + mls v3.4S, v15.4S, v4.4S + ld2 {v26.4S, v27.4S}, [x2], #32 + + st1 { v0.4S}, [x0], #16 + sqrdmulh v28.4S, v16.4S, v20.4S + st1 { v1.4S}, [x0], #16 + sqrdmulh v29.4S, v17.4S, v22.4S + st1 { v2.4S}, [x0], #16 + sqrdmulh v30.4S, v18.4S, v24.4S + st1 { v3.4S}, [x0], #16 + sqrdmulh v31.4S, v19.4S, v26.4S + + dup v20.4S, w20 + + mul v16.4S, v16.4S, v21.4S + ld1 { v0.4S}, [x1], #16 + mul v17.4S, v17.4S, v23.4S + ld1 { v1.4S}, [x1], #16 + mul v18.4S, v18.4S, v25.4S + ld1 { v2.4S}, [x1], #16 + mul v19.4S, v19.4S, v27.4S + ld1 { v3.4S}, [x1], #16 + + mls v16.4S, v28.4S, v20.4S + ld2 { v4.4S, v5.4S}, [x2], #32 + mls v17.4S, v29.4S, v20.4S + ld2 { v6.4S, v7.4S}, [x2], #32 + mls v18.4S, v30.4S, v20.4S + ld2 { v8.4S, v9.4S}, [x2], #32 + mls v19.4S, v31.4S, v20.4S + ld2 {v10.4S, v11.4S}, [x2], #32 + + st1 {v16.4S}, [x0], #16 + sqrdmulh v12.4S, v0.4S, v4.4S + st1 {v17.4S}, [x0], #16 + sqrdmulh v13.4S, v1.4S, v6.4S + st1 {v18.4S}, [x0], #16 + sqrdmulh v14.4S, v2.4S, v8.4S + st1 {v19.4S}, [x0], #16 + sqrdmulh v15.4S, v3.4S, v10.4S + + sub x16, x16, #1 + cbnz x16, _point_mul_loop + + dup v4.4S, w20 + + mul v0.4S, v0.4S, v5.4S + ld1 {v16.4S}, [x1], #16 + mul v1.4S, v1.4S, v7.4S + ld1 {v17.4S}, [x1], #16 + mul v2.4S, v2.4S, v9.4S + ld1 {v18.4S}, [x1], #16 + mul v3.4S, v3.4S, v11.4S + ld1 {v19.4S}, [x1], #16 + + mls v0.4S, v12.4S, v4.4S + ld2 {v20.4S, v21.4S}, [x2], #32 + mls v1.4S, v13.4S, v4.4S + ld2 {v22.4S, v23.4S}, [x2], #32 + mls v2.4S, v14.4S, v4.4S + ld2 {v24.4S, v25.4S}, [x2], #32 + mls v3.4S, v15.4S, v4.4S + ld2 {v26.4S, v27.4S}, [x2], #32 + + st1 { v0.4S}, [x0], #16 + sqrdmulh v28.4S, v16.4S, v20.4S + st1 { v1.4S}, [x0], #16 + sqrdmulh v29.4S, v17.4S, v22.4S + st1 { v2.4S}, [x0], #16 + sqrdmulh v30.4S, v18.4S, v24.4S + st1 { v3.4S}, [x0], #16 + sqrdmulh v31.4S, v19.4S, v26.4S + + dup v20.4S, w20 + + mul v16.4S, 
v16.4S, v21.4S + mul v17.4S, v17.4S, v23.4S + mul v18.4S, v18.4S, v25.4S + mul v19.4S, v19.4S, v27.4S + + mls v16.4S, v28.4S, v20.4S + mls v17.4S, v29.4S, v20.4S + mls v18.4S, v30.4S, v20.4S + mls v19.4S, v31.4S, v20.4S + + st1 {v16.4S}, [x0], #16 + st1 {v17.4S}, [x0], #16 + st1 {v18.4S}, [x0], #16 + st1 {v19.4S}, [x0], #16 + + pop_all + + br lr + + + + + + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/__asm_narrow.S b/src/kem/saber/pqclean_saber_aarch64/__asm_narrow.S new file mode 100644 index 0000000000..cc967e45ec --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/__asm_narrow.S @@ -0,0 +1,247 @@ + +#include "SABER_params.h" + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_round +.global _PQCLEAN_SABER_AARCH64_asm_round +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_round, %function +#endif +PQCLEAN_SABER_AARCH64_asm_round: +_PQCLEAN_SABER_AARCH64_asm_round: + + + .equ srv, (SABER_EQ-SABER_EP) + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + + srshr v0.8H, v0.8H, #srv + srshr v2.8H, v2.8H, #srv + srshr v4.8H, v4.8H, #srv + srshr v6.8H, v6.8H, #srv + + mov x7, #7 + _round_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + + srshr v0.8H, v0.8H, #srv + srshr v2.8H, v2.8H, #srv + srshr v4.8H, v4.8H, #srv + srshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _round_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_enc_add_msg +.global _PQCLEAN_SABER_AARCH64_asm_enc_add_msg +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_enc_add_msg, %function +#endif +PQCLEAN_SABER_AARCH64_asm_enc_add_msg: +_PQCLEAN_SABER_AARCH64_asm_enc_add_msg: + + .equ srv, (SABER_EP-SABER_ET) + .equ slv, (SABER_EP-1) + + dup v30.8H, w3 + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + mov x7, #7 + _enc_add_msg_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + sub 
x7, x7, #1 + cbnz x7, _enc_add_msg_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_dec_get_msg +.global _PQCLEAN_SABER_AARCH64_asm_dec_get_msg +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_dec_get_msg, %function +#endif +PQCLEAN_SABER_AARCH64_asm_dec_get_msg: +_PQCLEAN_SABER_AARCH64_asm_dec_get_msg: + + .equ srv, (SABER_EP-1) + .equ slv, (SABER_EP-SABER_ET) + + dup v30.8H, w3 + + ld2 { v0.8H, v1.8H}, [x1], #32 + ld2 { v2.8H, v3.8H}, [x1], #32 + ld2 { v4.8H, v5.8H}, [x1], #32 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + mov x7, #7 + _dec_get_msg_loop: + + st1 { v0.8H}, [x0], #16 + ld2 { v0.8H, v1.8H}, [x1], #32 + st1 { v2.8H}, [x0], #16 + ld2 { v2.8H, v3.8H}, [x1], #32 + st1 { v4.8H}, [x0], #16 + ld2 { v4.8H, v5.8H}, [x1], #32 + st1 { v6.8H}, [x0], #16 + ld2 { v6.8H, v7.8H}, [x1], #32 + ld1 { v1.8H}, [x2], #16 + ld1 { v3.8H}, [x2], #16 + ld1 { v5.8H}, [x2], #16 + ld1 { v7.8H}, [x2], #16 + + add v0.8H, v0.8H, v30.8H + add v2.8H, v2.8H, v30.8H + add v4.8H, v4.8H, v30.8H + add v6.8H, v6.8H, v30.8H + + shl v1.8H, v1.8H, #slv + shl v3.8H, v3.8H, #slv + shl v5.8H, v5.8H, #slv + shl v7.8H, v7.8H, #slv + + sub v0.8H, v0.8H, v1.8H + sub v2.8H, v2.8H, v3.8H + sub v4.8H, v4.8H, v5.8H + sub v6.8H, v6.8H, v7.8H + + sshr v0.8H, v0.8H, #srv + sshr v2.8H, v2.8H, #srv + sshr v4.8H, v4.8H, #srv + sshr v6.8H, v6.8H, #srv + + sub x7, x7, #1 + cbnz x7, _dec_get_msg_loop + + st1 { v0.8H}, [x0], #16 + st1 { v2.8H}, [x0], #16 + st1 { v4.8H}, [x0], #16 + st1 { v6.8H}, [x0], #16 + + br lr + + + + + + + + + + + + + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/__asm_pack_unpack.S b/src/kem/saber/pqclean_saber_aarch64/__asm_pack_unpack.S new file mode 100644 index 0000000000..f972b66664 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/__asm_pack_unpack.S @@ -0,0 +1,345 @@ + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_1_to_16 +.global _PQCLEAN_SABER_AARCH64_asm_1_to_16 +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_1_to_16, %function +#endif +PQCLEAN_SABER_AARCH64_asm_1_to_16: +_PQCLEAN_SABER_AARCH64_asm_1_to_16: + + mov x15, #8 + _1_to_16_outer_loop: + + ldr w2, [x1], #4 + + mov x11, #4 + _1_to_16_inner_loop: + + sbfx w3, w2, #0, #1 + strh w3, [x0], #2 + sbfx w4, w2, #1, #1 + strh w4, [x0], #2 + sbfx w5, w2, #2, #1 + strh w5, [x0], #2 + sbfx w6, w2, #3, #1 + strh w6, [x0], #2 + sbfx w7, w2, #4, #1 + strh w7, [x0], #2 + sbfx w8, w2, #5, #1 + strh w8, [x0], #2 + sbfx w9, w2, #6, #1 + strh w9, [x0], #2 + sbfx w10, w2, #7, #1 + strh w10, [x0], #2 + + lsr w2, w2, #8 + + sub x11, x11, #1 + cbnz x11, _1_to_16_inner_loop + + sub x15, x15, #1 + cbnz x15, _1_to_16_outer_loop + + br lr + + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_4_to_16 +.global _PQCLEAN_SABER_AARCH64_asm_4_to_16 +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_4_to_16, %function +#endif +PQCLEAN_SABER_AARCH64_asm_4_to_16: +_PQCLEAN_SABER_AARCH64_asm_4_to_16: + + mov x7, #32 + _4_to_16_loop: + + 
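+    // Each 32-bit word packs eight signed 4-bit coefficients: sbfx sign-extends
+    // one nibble at a time and strh widens it to a 16-bit halfword
+    // (32 iterations x 8 nibbles = 256 coefficients).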
ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + strh w3, [x0], #2 + sbfx w4, w2, #4, #4 + strh w4, [x0], #2 + sbfx w5, w2, #8, #4 + strh w5, [x0], #2 + sbfx w6, w2, #12, #4 + strh w6, [x0], #2 + + sbfx w3, w2, #16, #4 + strh w3, [x0], #2 + sbfx w4, w2, #20, #4 + strh w4, [x0], #2 + sbfx w5, w2, #24, #4 + strh w5, [x0], #2 + sbfx w6, w2, #28, #4 + strh w6, [x0], #2 + + sub x7, x7, #1 + cbnz x7, _4_to_16_loop + + br lr + + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_10_to_32 +.global _PQCLEAN_SABER_AARCH64_asm_10_to_32 +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_10_to_32, %function +#endif +PQCLEAN_SABER_AARCH64_asm_10_to_32: +_PQCLEAN_SABER_AARCH64_asm_10_to_32: + + mov x7, #16 + _10_to_32_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #10 + str w3, [x0], #4 + sbfx w4, w2, #10, #10 + str w4, [x0], #4 + sbfx w5, w2, #20, #10 + str w5, [x0], #4 + lsr w6, w2, #30 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #8 + lsl w3, w3, #2 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #8, #10 + str w4, [x0], #4 + sbfx w5, w2, #18, #10 + str w5, [x0], #4 + lsr w6, w2, #28 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #6 + lsl w3, w3, #4 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #6, #10 + str w4, [x0], #4 + sbfx w5, w2, #16, #10 + str w5, [x0], #4 + lsr w6, w2, #26 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + lsl w3, w3, #6 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #4, #10 + str w4, [x0], #4 + sbfx w5, w2, #14, #10 + str w5, [x0], #4 + lsr w6, w2, #24 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #2 + lsl w3, w3, #8 + orr w3, w3, w6 + str w3, [x0], #4 + sbfx w4, w2, #2, #10 + str w4, [x0], #4 + sbfx w5, w2, #12, #10 + str w5, [x0], #4 + sbfx w6, w2, #22, #10 + str w6, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _10_to_32_loop + + br lr + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_13_to_32 +.global _PQCLEAN_SABER_AARCH64_asm_13_to_32 +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_13_to_32, %function +#endif +PQCLEAN_SABER_AARCH64_asm_13_to_32: +_PQCLEAN_SABER_AARCH64_asm_13_to_32: + + mov x7, #8 + _13_to_32_loop: + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #13 + str w3, [x0], #4 + sbfx w4, w2, #13, #13 + str w4, [x0], #4 + lsr w5, w2, #26 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #7 + lsl w3, w3, #6 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #7, #13 + str w4, [x0], #4 + lsr w5, w2, #20 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #1 + lsl w3, w3, #12 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #1, #13 + str w4, [x0], #4 + sbfx w5, w2, #14, #13 + str w5, [x0], #4 + lsr w5, w2, #27 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #8 + lsl w3, w3, #5 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #8, #13 + str w4, [x0], #4 + lsr w5, w2, #21 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #2 + lsl w3, w3, #11 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #2, #13 + str w4, [x0], #4 + sbfx w5, w2, #15, #13 + str w5, [x0], #4 + lsr w5, w2, #28 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #9 + lsl w3, w3, #4 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #9, #13 + str w4, [x0], #4 + lsr w5, w2, #22 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #3 + lsl w3, w3, #10 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #3, #13 + str w4, [x0], #4 + sbfx w5, w2, #16, #13 + str w5, [x0], #4 + lsr w5, w2, #29 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #10 + lsl w3, w3, #3 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #10, #13 + str w4, [x0], #4 + lsr w5, w2, #23 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #4 + lsl w3, w3, #9 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #4, #13 + str w4, [x0], 
#4 + sbfx w5, w2, #17, #13 + str w5, [x0], #4 + lsr w5, w2, #30 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #11 + lsl w3, w3, #2 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #11, #13 + str w4, [x0], #4 + lsr w5, w2, #24 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #5 + lsl w3, w3, #8 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #5, #13 + str w4, [x0], #4 + sbfx w5, w2, #18, #13 + str w5, [x0], #4 + lsr w5, w2, #31 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #12 + lsl w3, w3, #1 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #12, #13 + str w4, [x0], #4 + lsr w5, w2, #25 + + ldr w2, [x1], #4 + + sbfx w3, w2, #0, #6 + lsl w3, w3, #7 + orr w3, w3, w5 + str w3, [x0], #4 + sbfx w4, w2, #6, #13 + str w4, [x0], #4 + sbfx w5, w2, #19, #13 + str w5, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _13_to_32_loop + + br lr + + +.align 2 +.global PQCLEAN_SABER_AARCH64_asm_16_to_32 +.global _PQCLEAN_SABER_AARCH64_asm_16_to_32 +#ifndef __clang__ +.type PQCLEAN_SABER_AARCH64_asm_16_to_32, %function +#endif +PQCLEAN_SABER_AARCH64_asm_16_to_32: +_PQCLEAN_SABER_AARCH64_asm_16_to_32: + + mov x7, #128 + _sbfx_loop: + + ldr w4, [x1], #4 + sbfx w5, w4, #0, #13 + sbfx w6, w4, #16, #13 + str w5, [x0], #4 + str w6, [x0], #4 + + sub x7, x7, #1 + cbnz x7, _sbfx_loop + + br lr + + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/api.h b/src/kem/saber/pqclean_saber_aarch64/api.h new file mode 100644 index 0000000000..4243736eab --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/api.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_SABER_AARCH64_API_H +#define PQCLEAN_SABER_AARCH64_API_H + + +#define PQCLEAN_SABER_AARCH64_CRYPTO_ALGNAME "Saber" +#define PQCLEAN_SABER_AARCH64_CRYPTO_BYTES 32 +#define PQCLEAN_SABER_AARCH64_CRYPTO_CIPHERTEXTBYTES 1088 +#define PQCLEAN_SABER_AARCH64_CRYPTO_PUBLICKEYBYTES 992 +#define PQCLEAN_SABER_AARCH64_CRYPTO_SECRETKEYBYTES 2304 + +int PQCLEAN_SABER_AARCH64_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_SABER_AARCH64_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_SABER_AARCH64_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + + +#endif /* PQCLEAN_SABER_AARCH64_API_H */ diff --git a/src/kem/saber/pqclean_saber_aarch64/cbd.c b/src/kem/saber/pqclean_saber_aarch64/cbd.c new file mode 100644 index 0000000000..ce0665385f --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/cbd.c @@ -0,0 +1,155 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. + * Copyright (c) 2020 by Cryptographic Engineering Research Group (CERG) + * ECE Department, George Mason University + * Fairfax, VA, U.S.A. 
+ * Author: Duc Tri Nguyen
+=============================================================================*/
+
+#include "cbd.h"
+#include <arm_neon.h>
+
+#define vload4(c, ptr) c = vld4q_u8(ptr);
+#define vstore4(ptr, c) vst4q_u16(ptr, c);
+
+// c = a & b
+#define vand8(c, a, b) c = vandq_u8(a, b);
+
+// c = a >> n
+#define vsr8(c, a, n) c = vshrq_n_u8(a, n);
+
+// c = a + b
+#define vadd8(c, a, b) c = vaddq_u8(a, b);
+
+// low c = (uint16_t) (a - b)
+#define vsubll8(c, a, b) c = vsubl_u8(a, b);
+
+// high c = (uint16_t) (a - b)
+#define vsublh8(c, a, b) c = vsubl_high_u8(a, b);
+
+
+// Centered binomial sampler: each input byte yields one coefficient a - b,
+// where a and b are the bit counts of its low and high nibble, respectively.
+static
+void neon_cbd4(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
+    uint8x16x4_t neon_buf, res, tmp, a, b; // 20
+    uint16x8x4_t store1, store2; // 8
+
+    uint8x16_t const_0x11, const_0xf; // 2
+    const_0x11 = vdupq_n_u8(0x11);
+    const_0xf = vdupq_n_u8(0xf);
+
+    // Total SIMD register: 30
+    for (int i = 0; i < SABER_POLYCOINBYTES; i += 4 * 16) {
+        // 0, 4, ...
+        // 1, 5, ...
+        // 2, 6, ...
+        // 3, 7, ...
+        vload4(neon_buf, &buf[i]);
+
+        // d = t & 0x11
+        vand8(res.val[0], neon_buf.val[0], const_0x11);
+        vand8(res.val[1], neon_buf.val[1], const_0x11);
+        vand8(res.val[2], neon_buf.val[2], const_0x11);
+        vand8(res.val[3], neon_buf.val[3], const_0x11);
+
+        // d += (t >> 1) & 0x11
+        vsr8(tmp.val[0], neon_buf.val[0], 1);
+        vsr8(tmp.val[1], neon_buf.val[1], 1);
+        vsr8(tmp.val[2], neon_buf.val[2], 1);
+        vsr8(tmp.val[3], neon_buf.val[3], 1);
+
+        vand8(tmp.val[0], tmp.val[0], const_0x11);
+        vand8(tmp.val[1], tmp.val[1], const_0x11);
+        vand8(tmp.val[2], tmp.val[2], const_0x11);
+        vand8(tmp.val[3], tmp.val[3], const_0x11);
+
+        vadd8(res.val[0], res.val[0], tmp.val[0]);
+        vadd8(res.val[1], res.val[1], tmp.val[1]);
+        vadd8(res.val[2], res.val[2], tmp.val[2]);
+        vadd8(res.val[3], res.val[3], tmp.val[3]);
+
+        // d += (t >> 2) & 0x11
+        vsr8(tmp.val[0], neon_buf.val[0], 2);
+        vsr8(tmp.val[1], neon_buf.val[1], 2);
+        vsr8(tmp.val[2], neon_buf.val[2], 2);
+        vsr8(tmp.val[3], neon_buf.val[3], 2);
+
+        vand8(tmp.val[0], tmp.val[0], const_0x11);
+        vand8(tmp.val[1], tmp.val[1], const_0x11);
+        vand8(tmp.val[2], tmp.val[2], const_0x11);
+        vand8(tmp.val[3], tmp.val[3], const_0x11);
+
+        vadd8(res.val[0], res.val[0], tmp.val[0]);
+        vadd8(res.val[1], res.val[1], tmp.val[1]);
+        vadd8(res.val[2], res.val[2], tmp.val[2]);
+        vadd8(res.val[3], res.val[3], tmp.val[3]);
+
+        // d += (t >> 3) & 0x11
+        vsr8(tmp.val[0], neon_buf.val[0], 3);
+        vsr8(tmp.val[1], neon_buf.val[1], 3);
+        vsr8(tmp.val[2], neon_buf.val[2], 3);
+        vsr8(tmp.val[3], neon_buf.val[3], 3);
+
+        vand8(tmp.val[0], tmp.val[0], const_0x11);
+        vand8(tmp.val[1], tmp.val[1], const_0x11);
+        vand8(tmp.val[2], tmp.val[2], const_0x11);
+        vand8(tmp.val[3], tmp.val[3], const_0x11);
+
+        vadd8(res.val[0], res.val[0], tmp.val[0]);
+        vadd8(res.val[1], res.val[1], tmp.val[1]);
+        vadd8(res.val[2], res.val[2], tmp.val[2]);
+        vadd8(res.val[3], res.val[3], tmp.val[3]);
+
+        // Get a
+        // 0, 4, 8 , 12 -- 16, 20, 24, 28
+        // 1, 5, 9 , 13 -- 17, 21, 25, 29
+        // 2, 6, 10, 14 -- 18, 22, 26, 30
+        // 3, 7, 11, 15 -- 19, 23, 27, 31
+        vand8(a.val[0], res.val[0], const_0xf);
+        vand8(a.val[1], res.val[1], const_0xf);
+        vand8(a.val[2], res.val[2], const_0xf);
+        vand8(a.val[3], res.val[3], const_0xf);
+
+        // Get b
+        // 0, 4, 8 , 12 -- 16, 20, 24, 28
+        // 1, 5, 9 , 13 -- 17, 21, 25, 29
+        // 2, 6, 10, 14 -- 18, 22, 26, 30
+        // 3, 7, 11, 15 -- 19, 23, 27, 31
+        vsr8(b.val[0], res.val[0], 4);
+        vsr8(b.val[1], res.val[1], 4);
+        vsr8(b.val[2], res.val[2], 4);
+        vsr8(b.val[3], res.val[3], 4);
+
+        // a - b
+        // 0, 4, 8 , 12 -- 16, 20, 24, 28 | 0
+        // 1, 5, 9 , 13 -- 17, 21, 25, 29 | 1
+        // 2, 6, 10, 14 -- 18, 22, 26, 30 | 2
+        // 3, 7, 11, 15 -- 19, 23, 27, 31 | 3
+        vsubll8(store1.val[0], vget_low_u8(a.val[0]), vget_low_u8(b.val[0]));
+        vsubll8(store1.val[1], vget_low_u8(a.val[1]), vget_low_u8(b.val[1]));
+        vsubll8(store1.val[2], vget_low_u8(a.val[2]), vget_low_u8(b.val[2]));
+        vsubll8(store1.val[3], vget_low_u8(a.val[3]), vget_low_u8(b.val[3]));
+
+        // a - b
+        // 32, 36, 40, 44 -- 48, 52, 56, 60
+        // 33, 37, 41, 45 -- 49, 53, 57, 61
+        // 34, 38, 42, 46 -- 50, 54, 58, 62
+        // 35, 39, 43, 47 -- 51, 55, 59, 63
+        vsublh8(store2.val[0], a.val[0], b.val[0]);
+        vsublh8(store2.val[1], a.val[1], b.val[1]);
+        vsublh8(store2.val[2], a.val[2], b.val[2]);
+        vsublh8(store2.val[3], a.val[3], b.val[3]);
+
+        vstore4(&s[i], store1);
+        vstore4(&s[i + 32], store2);
+    }
+}
+
+
+void cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
+    neon_cbd4(s, buf);
+}
diff --git a/src/kem/saber/pqclean_saber_aarch64/cbd.h b/src/kem/saber/pqclean_saber_aarch64/cbd.h
new file mode 100644
index 0000000000..6e5c360cb3
--- /dev/null
+++ b/src/kem/saber/pqclean_saber_aarch64/cbd.h
@@ -0,0 +1,17 @@
+#ifndef CBD_H
+#define CBD_H
+/*---------------------------------------------------------------------
+This file has been adapted from the implementation
+(available at, Public Domain https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by : Joppe Bos, Léo Ducas, Eike Kiltz, Tancrède Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehlé
+----------------------------------------------------------------------*/
+
+#include "SABER_params.h"
+#include <stdint.h>
+
+#define cbd SABER_NAMESPACE(cbd)
+void cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]);
+
+#endif
diff --git a/src/kem/saber/pqclean_saber_aarch64/fips202x2.c b/src/kem/saber/pqclean_saber_aarch64/fips202x2.c
new file mode 100644
index 0000000000..3924900e9e
--- /dev/null
+++ b/src/kem/saber/pqclean_saber_aarch64/fips202x2.c
@@ -0,0 +1,646 @@
+#include "fips202x2.h"
+#include <arm_neon.h>
+#include <stddef.h>
+
+
+#define NROUNDS 24
+
+// Define NEON operation
+// c = load(ptr)
+#define vload(ptr) vld1q_u64(ptr);
+// ptr <= c;
+#define vstore(ptr, c) vst1q_u64(ptr, c);
+// c = a ^ b
+#define vxor(c, a, b) c = veorq_u64(a, b);
+// Rotate by n bit ((a << offset) ^ (a >> (64-offset)))
+#define vROL(out, a, offset)    \
+    (out) = vshlq_n_u64(a, offset); \
+    (out) = vsriq_n_u64(out, a, 64 - (offset));
+// Xor chain: out = a ^ b ^ c ^ d ^ e
+#define vXOR4(out, a, b, c, d, e) \
+    (out) = veorq_u64(a, b);      \
+    (out) = veorq_u64(out, c);    \
+    (out) = veorq_u64(out, d);    \
+    (out) = veorq_u64(out, e);
+// Not And c = ~a & b
+// #define vbic(c, a, b) c = vbicq_u64(b, a);
+// Xor Not And: out = a ^ ( (~b) & c)
+#define vXNA(out, a, b, c) \
+    (out) = vbicq_u64(c, b); \
+    (out) = veorq_u64(out, a);
+// Rotate by 1 bit, then XOR: a ^ ROL(b); RAX1 is a SHA3-extension instruction, not supported on all cores
+#define vrxor(c, a, b) c = vrax1q_u64(a, b);
+// End Define
+
+/* Keccak round constants */
+static const uint64_t neon_KeccakF_RoundConstants[NROUNDS] = {
+    (uint64_t)0x0000000000000001ULL,
+    (uint64_t)0x0000000000008082ULL,
+    (uint64_t)0x800000000000808aULL,
+    (uint64_t)0x8000000080008000ULL,
+    (uint64_t)0x000000000000808bULL,
+    (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008081ULL,
+    (uint64_t)0x8000000000008009ULL,
+    (uint64_t)0x000000000000008aULL,
+    (uint64_t)0x0000000000000088ULL,
+    (uint64_t)0x0000000080008009ULL,
+    (uint64_t)0x000000008000000aULL, +
(uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + +/************************************************* +* Name: KeccakF1600_StatePermutex2 +* +* Description: The Keccak F1600 Permutation +* +* Arguments: - uint64_t *state: pointer to input/output Keccak state +**************************************************/ +static inline +void KeccakF1600_StatePermutex2(v128 state[25]) { + v128 Aba, Abe, Abi, Abo, Abu; + v128 Aga, Age, Agi, Ago, Agu; + v128 Aka, Ake, Aki, Ako, Aku; + v128 Ama, Ame, Ami, Amo, Amu; + v128 Asa, Ase, Asi, Aso, Asu; + v128 BCa, BCe, BCi, BCo, BCu; // tmp + v128 Da, De, Di, Do, Du; // D + v128 Eba, Ebe, Ebi, Ebo, Ebu; + v128 Ega, Ege, Egi, Ego, Egu; + v128 Eka, Eke, Eki, Eko, Eku; + v128 Ema, Eme, Emi, Emo, Emu; + v128 Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = state[0]; + Abe = state[1]; + Abi = state[2]; + Abo = state[3]; + Abu = state[4]; + Aga = state[5]; + Age = state[6]; + Agi = state[7]; + Ago = state[8]; + Agu = state[9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for (int round = 0; round < NROUNDS; round += 2) { + // prepareTheta + vXOR4(BCa, Aba, Aga, Aka, Ama, Asa); + vXOR4(BCe, Abe, Age, Ake, Ame, Ase); + vXOR4(BCi, Abi, Agi, Aki, Ami, Asi); + vXOR4(BCo, Abo, Ago, Ako, Amo, Aso); + vXOR4(BCu, Abu, Agu, Aku, Amu, Asu); + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Aba, Aba, Da); + vxor(Age, Age, De); + vROL(BCe, Age, 44); + vxor(Aki, Aki, Di); + vROL(BCi, Aki, 43); + vxor(Amo, Amo, Do); + vROL(BCo, Amo, 21); + vxor(Asu, Asu, Du); + vROL(BCu, Asu, 14); + vXNA(Eba, Aba, BCe, BCi); + vxor(Eba, Eba, vdupq_n_u64(neon_KeccakF_RoundConstants[round])); + vXNA(Ebe, BCe, BCi, BCo); + vXNA(Ebi, BCi, BCo, BCu); + vXNA(Ebo, BCo, BCu, Aba); + vXNA(Ebu, BCu, Aba, BCe); + + vxor(Abo, Abo, Do); + vROL(BCa, Abo, 28); + vxor(Agu, Agu, Du); + vROL(BCe, Agu, 20); + vxor(Aka, Aka, Da); + vROL(BCi, Aka, 3); + vxor(Ame, Ame, De); + vROL(BCo, Ame, 45); + vxor(Asi, Asi, Di); + vROL(BCu, Asi, 61); + vXNA(Ega, BCa, BCe, BCi); + vXNA(Ege, BCe, BCi, BCo); + vXNA(Egi, BCi, BCo, BCu); + vXNA(Ego, BCo, BCu, BCa); + vXNA(Egu, BCu, BCa, BCe); + + vxor(Abe, Abe, De); + vROL(BCa, Abe, 1); + vxor(Agi, Agi, Di); + vROL(BCe, Agi, 6); + vxor(Ako, Ako, Do); + vROL(BCi, Ako, 25); + vxor(Amu, Amu, Du); + vROL(BCo, Amu, 8); + vxor(Asa, Asa, Da); + vROL(BCu, Asa, 18); + vXNA(Eka, BCa, BCe, BCi); + vXNA(Eke, BCe, BCi, BCo); + vXNA(Eki, BCi, BCo, BCu); + vXNA(Eko, BCo, BCu, BCa); + vXNA(Eku, BCu, BCa, BCe); + + vxor(Abu, Abu, Du); + vROL(BCa, Abu, 27); + vxor(Aga, Aga, Da); + vROL(BCe, Aga, 36); + vxor(Ake, Ake, De); + vROL(BCi, Ake, 10); + vxor(Ami, Ami, Di); + vROL(BCo, Ami, 15); + vxor(Aso, Aso, Do); + vROL(BCu, Aso, 56); + vXNA(Ema, BCa, BCe, BCi); + vXNA(Eme, BCe, 
BCi, BCo); + vXNA(Emi, BCi, BCo, BCu); + vXNA(Emo, BCo, BCu, BCa); + vXNA(Emu, BCu, BCa, BCe); + + vxor(Abi, Abi, Di); + vROL(BCa, Abi, 62); + vxor(Ago, Ago, Do); + vROL(BCe, Ago, 55); + vxor(Aku, Aku, Du); + vROL(BCi, Aku, 39); + vxor(Ama, Ama, Da); + vROL(BCo, Ama, 41); + vxor(Ase, Ase, De); + vROL(BCu, Ase, 2); + vXNA(Esa, BCa, BCe, BCi); + vXNA(Ese, BCe, BCi, BCo); + vXNA(Esi, BCi, BCo, BCu); + vXNA(Eso, BCo, BCu, BCa); + vXNA(Esu, BCu, BCa, BCe); + + // Next Round + + // prepareTheta + vXOR4(BCa, Eba, Ega, Eka, Ema, Esa); + vXOR4(BCe, Ebe, Ege, Eke, Eme, Ese); + vXOR4(BCi, Ebi, Egi, Eki, Emi, Esi); + vXOR4(BCo, Ebo, Ego, Eko, Emo, Eso); + vXOR4(BCu, Ebu, Egu, Eku, Emu, Esu); + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + vROL(Da, BCe, 1); + vxor(Da, BCu, Da); + vROL(De, BCi, 1); + vxor(De, BCa, De); + vROL(Di, BCo, 1); + vxor(Di, BCe, Di); + vROL(Do, BCu, 1); + vxor(Do, BCi, Do); + vROL(Du, BCa, 1); + vxor(Du, BCo, Du); + + vxor(Eba, Eba, Da); + vxor(Ege, Ege, De); + vROL(BCe, Ege, 44); + vxor(Eki, Eki, Di); + vROL(BCi, Eki, 43); + vxor(Emo, Emo, Do); + vROL(BCo, Emo, 21); + vxor(Esu, Esu, Du); + vROL(BCu, Esu, 14); + vXNA(Aba, Eba, BCe, BCi); + vxor(Aba, Aba, vdupq_n_u64(neon_KeccakF_RoundConstants[round + 1])); + vXNA(Abe, BCe, BCi, BCo); + vXNA(Abi, BCi, BCo, BCu); + vXNA(Abo, BCo, BCu, Eba); + vXNA(Abu, BCu, Eba, BCe); + + vxor(Ebo, Ebo, Do); + vROL(BCa, Ebo, 28); + vxor(Egu, Egu, Du); + vROL(BCe, Egu, 20); + vxor(Eka, Eka, Da); + vROL(BCi, Eka, 3); + vxor(Eme, Eme, De); + vROL(BCo, Eme, 45); + vxor(Esi, Esi, Di); + vROL(BCu, Esi, 61); + vXNA(Aga, BCa, BCe, BCi); + vXNA(Age, BCe, BCi, BCo); + vXNA(Agi, BCi, BCo, BCu); + vXNA(Ago, BCo, BCu, BCa); + vXNA(Agu, BCu, BCa, BCe); + + vxor(Ebe, Ebe, De); + vROL(BCa, Ebe, 1); + vxor(Egi, Egi, Di); + vROL(BCe, Egi, 6); + vxor(Eko, Eko, Do); + vROL(BCi, Eko, 25); + vxor(Emu, Emu, Du); + vROL(BCo, Emu, 8); + vxor(Esa, Esa, Da); + vROL(BCu, Esa, 18); + vXNA(Aka, BCa, BCe, BCi); + vXNA(Ake, BCe, BCi, BCo); + vXNA(Aki, BCi, BCo, BCu); + vXNA(Ako, BCo, BCu, BCa); + vXNA(Aku, BCu, BCa, BCe); + + vxor(Ebu, Ebu, Du); + vROL(BCa, Ebu, 27); + vxor(Ega, Ega, Da); + vROL(BCe, Ega, 36); + vxor(Eke, Eke, De); + vROL(BCi, Eke, 10); + vxor(Emi, Emi, Di); + vROL(BCo, Emi, 15); + vxor(Eso, Eso, Do); + vROL(BCu, Eso, 56); + vXNA(Ama, BCa, BCe, BCi); + vXNA(Ame, BCe, BCi, BCo); + vXNA(Ami, BCi, BCo, BCu); + vXNA(Amo, BCo, BCu, BCa); + vXNA(Amu, BCu, BCa, BCe); + + vxor(Ebi, Ebi, Di); + vROL(BCa, Ebi, 62); + vxor(Ego, Ego, Do); + vROL(BCe, Ego, 55); + vxor(Eku, Eku, Du); + vROL(BCi, Eku, 39); + vxor(Ema, Ema, Da); + vROL(BCo, Ema, 41); + vxor(Ese, Ese, De); + vROL(BCu, Ese, 2); + vXNA(Asa, BCa, BCe, BCi); + vXNA(Ase, BCe, BCi, BCo); + vXNA(Asi, BCi, BCo, BCu); + vXNA(Aso, BCo, BCu, BCa); + vXNA(Asu, BCu, BCa, BCe); + } + + state[0] = Aba; + state[1] = Abe; + state[2] = Abi; + state[3] = Abo; + state[4] = Abu; + state[5] = Aga; + state[6] = Age; + state[7] = Agi; + state[8] = Ago; + state[9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; +} + +/************************************************* +* Name: keccakx2_absorb +* +* Description: Absorb step of Keccak; +* non-incremental, starts by zeroeing the state. 
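+* Processes two independent input streams at once: lane 0 of each 128-bit
+* state word carries the in0 stream and lane 1 carries in1 (interleaved via
+* vzip1q_u64/vzip2q_u64 below).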
+* +* Arguments: - uint64_t *s: pointer to (uninitialized) output Keccak state +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - const uint8_t *m: pointer to input to be absorbed into s +* - size_t mlen: length of input in bytes +* - uint8_t p: domain-separation byte for different +* Keccak-derived functions +**************************************************/ +static +void keccakx2_absorb(v128 s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen, + uint8_t p) { + size_t i, pos = 0; + + // Declare SIMD registers + v128 tmp, mask; + uint64x1_t a, b; + uint64x2_t a1, b1, atmp1, btmp1; + uint64x2x2_t a2, b2, atmp2, btmp2; + // End + + for (i = 0; i < 25; ++i) { + s[i] = vdupq_n_u64(0); + } + + // Load in0[i] to register, then in1[i] to register, exchange them + while (inlen >= r) { + for (i = 0; i < r / 8 - 1; i += 4) { + a2 = vld1q_u64_x2((uint64_t *)&in0[pos]); + b2 = vld1q_u64_x2((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp2.val[0] = vzip1q_u64(a2.val[0], b2.val[0]); + atmp2.val[1] = vzip1q_u64(a2.val[1], b2.val[1]); + // AC = zip2(AB and CD) + btmp2.val[0] = vzip2q_u64(a2.val[0], b2.val[0]); + btmp2.val[1] = vzip2q_u64(a2.val[1], b2.val[1]); + + vxor(s[i + 0], s[i + 0], atmp2.val[0]); + vxor(s[i + 1], s[i + 1], btmp2.val[0]); + vxor(s[i + 2], s[i + 2], atmp2.val[1]); + vxor(s[i + 3], s[i + 3], btmp2.val[1]); + + pos += 8 * 2 * 2; + } + // Last iteration + i = r / 8 - 1; + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + pos += 8; + + KeccakF1600_StatePermutex2(s); + inlen -= r; + } + + i = 0; + while (inlen >= 16) { + a1 = vld1q_u64((uint64_t *)&in0[pos]); + b1 = vld1q_u64((uint64_t *)&in1[pos]); + // BD = zip1(AB and CD) + atmp1 = vzip1q_u64(a1, b1); + // AC = zip2(AB and CD) + btmp1 = vzip2q_u64(a1, b1); + + vxor(s[i + 0], s[i + 0], atmp1); + vxor(s[i + 1], s[i + 1], btmp1); + + i += 2; + pos += 8 * 2; + inlen -= 8 * 2; + } + + if (inlen >= 8) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + vxor(s[i], s[i], tmp); + + i++; + pos += 8; + inlen -= 8; + } + + if (inlen) { + a = vld1_u64((uint64_t *)&in0[pos]); + b = vld1_u64((uint64_t *)&in1[pos]); + tmp = vcombine_u64(a, b); + mask = vdupq_n_u64((1ULL << (8 * inlen)) - 1); + tmp = vandq_u64(tmp, mask); + vxor(s[i], s[i], tmp); + } + + tmp = vdupq_n_u64((uint64_t)p << (8 * inlen)); + vxor(s[i], s[i], tmp); + + mask = vdupq_n_u64(1ULL << 63); + vxor(s[r / 8 - 1], s[r / 8 - 1], mask); +} + +/************************************************* +* Name: keccak_squeezeblocks +* +* Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each. +* Modifies the state. Can be called multiple times to keep +* squeezing, i.e., is incremental. 
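+* The x2 variant squeezes two output streams at once, unzipping lane 0 of
+* each state word into out0 and lane 1 into out1.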
+* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed (written to h) +* - unsigned int r: rate in bytes (e.g., 168 for SHAKE128) +* - uint64_t *s: pointer to input/output Keccak state +**************************************************/ +static +void keccakx2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + unsigned int r, + v128 s[25]) { + unsigned int i; + + uint64x1_t a, b; + uint64x2x2_t a2, b2; + + while (nblocks > 0) { + KeccakF1600_StatePermutex2(s); + + for (i = 0; i < r / 8 - 1; i += 4) { + a2.val[0] = vuzp1q_u64(s[i], s[i + 1]); + b2.val[0] = vuzp2q_u64(s[i], s[i + 1]); + a2.val[1] = vuzp1q_u64(s[i + 2], s[i + 3]); + b2.val[1] = vuzp2q_u64(s[i + 2], s[i + 3]); + vst1q_u64_x2((uint64_t *)out0, a2); + vst1q_u64_x2((uint64_t *)out1, b2); + + out0 += 32; + out1 += 32; + } + + i = r / 8 - 1; + // Last iteration + a = vget_low_u64(s[i]); + b = vget_high_u64(s[i]); + vst1_u64((uint64_t *)out0, a); + vst1_u64((uint64_t *)out1, b); + + out0 += 8; + out1 += 8; + + --nblocks; + } +} + +/************************************************* +* Name: shake128x2_absorb +* +* Description: Absorb step of the SHAKE128 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake128x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE128_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake128_squeezeblocks +* +* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of +* SHAKE128_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. +* +* Arguments: - uint8_t *out: pointer to output blocks +* - size_t nblocks: number of blocks to be squeezed +* (written to output) +* - keccakx2_state *s: pointer to input/output Keccak state +**************************************************/ +void shake128x2_squeezeblocks(uint8_t *out0, + uint8_t *out1, + size_t nblocks, + keccakx2_state *state) { + keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE128_RATE, state->s); +} + +/************************************************* +* Name: shake256_absorb +* +* Description: Absorb step of the SHAKE256 XOF. +* non-incremental, starts by zeroeing the state. +* +* Arguments: - keccakx2_state *s: pointer to (uninitialized) output Keccak state +* - const uint8_t *in: pointer to input to be absorbed into s +* - size_t inlen: length of input in bytes +**************************************************/ +void shake256x2_absorb(keccakx2_state *state, + const uint8_t *in0, + const uint8_t *in1, + size_t inlen) { + keccakx2_absorb(state->s, SHAKE256_RATE, in0, in1, inlen, 0x1F); +} + +/************************************************* +* Name: shake256_squeezeblocks +* +* Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of +* SHAKE256_RATE bytes each. Modifies the state. Can be called +* multiple times to keep squeezing, i.e., is incremental. 
+ *
+* Arguments:   - uint8_t *out: pointer to output blocks
+*              - size_t nblocks: number of blocks to be squeezed
+*                (written to output)
+*              - keccakx2_state *s: pointer to input/output Keccak state
+**************************************************/
+void shake256x2_squeezeblocks(uint8_t *out0,
+                              uint8_t *out1,
+                              size_t nblocks,
+                              keccakx2_state *state) {
+    keccakx2_squeezeblocks(out0, out1, nblocks, SHAKE256_RATE, state->s);
+}
+
+/*************************************************
+* Name:        shake128x2
+*
+* Description: SHAKE128 XOF with non-incremental API
+*
+* Arguments:   - uint8_t *out: pointer to output
+*              - size_t outlen: requested output length in bytes
+*              - const uint8_t *in: pointer to input
+*              - size_t inlen: length of input in bytes
+**************************************************/
+void shake128x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen) {
+    unsigned int i;
+    size_t nblocks = outlen / SHAKE128_RATE;
+    uint8_t t[2][SHAKE128_RATE];
+    keccakx2_state state;
+
+    shake128x2_absorb(&state, in0, in1, inlen);
+    shake128x2_squeezeblocks(out0, out1, nblocks, &state);
+
+    out0 += nblocks * SHAKE128_RATE;
+    out1 += nblocks * SHAKE128_RATE;
+    outlen -= nblocks * SHAKE128_RATE;
+
+    if (outlen) {
+        shake128x2_squeezeblocks(t[0], t[1], 1, &state);
+        for (i = 0; i < outlen; ++i) {
+            out0[i] = t[0][i];
+            out1[i] = t[1][i];
+        }
+    }
+}
+
+/*************************************************
+* Name:        shake256x2
+*
+* Description: SHAKE256 XOF with non-incremental API
+*
+* Arguments:   - uint8_t *out: pointer to output
+*              - size_t outlen: requested output length in bytes
+*              - const uint8_t *in: pointer to input
+*              - size_t inlen: length of input in bytes
+**************************************************/
+void shake256x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen) {
+    unsigned int i;
+    size_t nblocks = outlen / SHAKE256_RATE;
+    uint8_t t[2][SHAKE256_RATE];
+    keccakx2_state state;
+
+    shake256x2_absorb(&state, in0, in1, inlen);
+    shake256x2_squeezeblocks(out0, out1, nblocks, &state);
+
+    out0 += nblocks * SHAKE256_RATE;
+    out1 += nblocks * SHAKE256_RATE;
+    outlen -= nblocks * SHAKE256_RATE;
+
+    if (outlen) {
+        shake256x2_squeezeblocks(t[0], t[1], 1, &state);
+        for (i = 0; i < outlen; ++i) {
+            out0[i] = t[0][i];
+            out1[i] = t[1][i];
+        }
+    }
+}
diff --git a/src/kem/saber/pqclean_saber_aarch64/fips202x2.h b/src/kem/saber/pqclean_saber_aarch64/fips202x2.h
new file mode 100644
index 0000000000..11579f3015
--- /dev/null
+++ b/src/kem/saber/pqclean_saber_aarch64/fips202x2.h
@@ -0,0 +1,54 @@
+#ifndef FIPS202X2_H
+#define FIPS202X2_H
+
+#include "SABER_params.h"
+#include <arm_neon.h>
+#include <stddef.h>
+#include "fips202.h"
+typedef uint64x2_t v128;
+
+typedef struct {
+    v128 s[25];
+} keccakx2_state;
+
+
+#define shake128x2_absorb SABER_NAMESPACE(shake128x2_absorb)
+void shake128x2_absorb(keccakx2_state *state,
+                       const uint8_t *in0,
+                       const uint8_t *in1,
+                       size_t inlen);
+
+#define shake128x2_squeezeblocks SABER_NAMESPACE(shake128x2_squeezeblocks)
+void shake128x2_squeezeblocks(uint8_t *out0,
+                              uint8_t *out1,
+                              size_t nblocks,
+                              keccakx2_state *state);
+
+#define shake256x2_absorb SABER_NAMESPACE(shake256x2_absorb)
+void shake256x2_absorb(keccakx2_state *state,
+                       const uint8_t *in0,
+                       const uint8_t *in1,
+                       size_t inlen);
+
+#define shake256x2_squeezeblocks SABER_NAMESPACE(shake256x2_squeezeblocks)
+void shake256x2_squeezeblocks(uint8_t *out0,
+                              uint8_t *out1,
+                              size_t nblocks,
+                              keccakx2_state *state);
+
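+/* Usage sketch (illustrative only; the seed buffers are hypothetical):
+ *
+ *     uint8_t out0[64], out1[64];
+ *     shake128x2(out0, out1, sizeof out0, seed0, seed1, 32);
+ *
+ * is intended to produce the same bytes as two independent shake128 calls
+ * on seed0 and seed1, while sharing a single Keccak-f[1600] computation
+ * across the two vector lanes. */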
+#define shake128x2 SABER_NAMESPACE(shake128x2)
+void shake128x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen);
+
+#define shake256x2 SABER_NAMESPACE(shake256x2)
+void shake256x2(uint8_t *out0,
+                uint8_t *out1,
+                size_t outlen,
+                const uint8_t *in0,
+                const uint8_t *in1,
+                size_t inlen);
+#endif
diff --git a/src/kem/saber/pqclean_saber_aarch64/kem.c b/src/kem/saber/pqclean_saber_aarch64/kem.c
new file mode 100644
index 0000000000..4d8603096b
--- /dev/null
+++ b/src/kem/saber/pqclean_saber_aarch64/kem.c
@@ -0,0 +1,84 @@
+/*=============================================================================
+This file has been adapted from the implementation
+(available at, Public Domain https://github.com/KULeuven-COSIC/SABER)
+of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM"
+by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren
+Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso.
+=============================================================================*/
+
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "fips202.h"
+#include "kem.h"
+#include "randombytes.h"
+#include "verify.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+int PQCLEAN_SABER_AARCH64_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
+    int i;
+
+    indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
+    for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
+        sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk
+    }
+
+    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended.
+
+    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number.
+    // This is output when check in crypto_kem_dec() fails.
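+    // Resulting sk layout: [ indcpa sk | pk | hash(pk) | z ]; the random z
+    // lets decapsulation return a pseudorandom key on re-encryption failure
+    // (implicit rejection).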
+    return (0);
+}
+
+int PQCLEAN_SABER_AARCH64_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) {
+
+    unsigned char kr[64]; // Will contain key, coins
+    unsigned char buf[64];
+
+    randombytes(buf, 32);
+
+    sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash does not release system RNG output
+
+    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM
+
+    sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]);
+    // K^ <-- kr[0:31]
+    // noiseseed (r) <-- kr[32:63];
+    indcpa_kem_enc(buf, kr + 32, pk, c); // buf[0:31] contains message; kr[32:63] contains randomness r;
+
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
+
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
+
+int PQCLEAN_SABER_AARCH64_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) {
+    int i, fail;
+    unsigned char cmp[SABER_BYTES_CCA_DEC];
+    unsigned char buf[64];
+    unsigned char kr[64]; // Will contain key, coins
+    const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
+
+    indcpa_kem_dec(sk, c, buf); // buf[0:31] <-- message
+
+    // Multitarget countermeasure for coins + contributory KEM
+    for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk
+        buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i];
+    }
+
+    sha3_512(kr, buf, 64);
+
+    indcpa_kem_enc(buf, kr + 32, pk, cmp);
+
+    fail = verify(c, cmp, SABER_BYTES_CCA_DEC);
+
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c)
+
+    cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail);
+
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
diff --git a/src/kem/saber/pqclean_saber_aarch64/kem.h b/src/kem/saber/pqclean_saber_aarch64/kem.h
new file mode 100644
index 0000000000..657e350d3c
--- /dev/null
+++ b/src/kem/saber/pqclean_saber_aarch64/kem.h
@@ -0,0 +1,17 @@
+#ifndef KEM_H
+#define KEM_H
+/*=============================================================================
+This file has been adapted from the implementation
+(available at, Public Domain https://github.com/KULeuven-COSIC/SABER)
+of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM"
+by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren
+Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso.
+=============================================================================*/ + +#include + +int PQCLEAN_SABER_AARCH64_crypto_kem_keypair(uint8_t *pk, uint8_t *sk); +int PQCLEAN_SABER_AARCH64_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk); +int PQCLEAN_SABER_AARCH64_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk); + +#endif diff --git a/src/kem/saber/pqclean_saber_aarch64/macros.i b/src/kem/saber/pqclean_saber_aarch64/macros.i new file mode 100644 index 0000000000..3773778bcc --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/macros.i @@ -0,0 +1,57 @@ + +#ifndef MACROS_S +#define MACROS_S + +#include "macros_common.i" + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro wrap_4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3, dS, qS, dD + + \mulacc \l0\dD, \a0\dS, \b0\dS + \mulacc2 \h0\dD, \a0\qS, \b0\qS + \mulacc \l1\dD, \a0\dS, \b1\dS + \mulacc2 \h1\dD, \a0\qS, \b1\qS + \mulacc \l2\dD, \a0\dS, \b2\dS + \mulacc2 \h2\dD, \a0\qS, \b2\qS + \mulacc \l3\dD, \a0\dS, \b3\dS + \mulacc2 \h3\dD, \a0\qS, \b3\qS + +.endm + +.macro _4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_4x4_asymmetric \mulacc, \mulacc2, \a0, \b0, \b1, \b2, \b3, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .2S, .4S, .2D +.endm + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_add_sub \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm + +#endif + diff --git a/src/kem/saber/pqclean_saber_aarch64/macros.inc 
b/src/kem/saber/pqclean_saber_aarch64/macros.inc new file mode 100644 index 0000000000..88c3675f29 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/macros.inc @@ -0,0 +1,57 @@ + +#ifndef MACROS_S +#define MACROS_S + +#include "macros_common.inc" + +.macro dq_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_top \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1 + wrap_dX_butterfly_bot \a0, \a1, \b0, \b1, \t0, \t1, \mod, \z0, \l0, \h0, \z1, \l1, \h1, .4S, .S +.endm + +.macro dq_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_dX_butterfly_mixed \a0, \a1, \b0, \b1, \t0, \t1, \a2, \a3, \b2, \b3, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_top \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3 + wrap_qX_butterfly_bot \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, .4S, .S +.endm + +.macro qq_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7 + wrap_qX_butterfly_mixed \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, \t0, \t1, \t2, \t3, \a4, \a5, \a6, \a7, \b4, \b5, \b6, \b7, \t4, \t5, \t6, \t7, \mod, \z0, \l0, \h0, \z1, \l1, \h1, \z2, \l2, \h2, \z3, \l3, \h3, \z4, \l4, \h4, \z5, \l5, \h5, \z6, \l6, \h6, \z7, \l7, \h7, .4S, .S +.endm + +.macro wrap_4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3, dS, qS, dD + + \mulacc \l0\dD, \a0\dS, \b0\dS + \mulacc2 \h0\dD, \a0\qS, \b0\qS + \mulacc \l1\dD, \a0\dS, \b1\dS + \mulacc2 \h1\dD, \a0\qS, \b1\qS + \mulacc \l2\dD, \a0\dS, \b2\dS + \mulacc2 \h2\dD, \a0\qS, \b2\qS + \mulacc \l3\dD, \a0\dS, \b3\dS + \mulacc2 \h3\dD, \a0\qS, \b3\qS + +.endm + +.macro _4x4_asymmetric mulacc, mulacc2, a0, b0, b1, b2, b3, l0, h0, l1, h1, l2, h2, l3, h3 + wrap_4x4_asymmetric \mulacc, \mulacc2, \a0, \b0, \b1, \b2, \b3, \l0, \h0, \l1, \h1, \l2, \h2, \l3, \h3, .2S, .4S, .2D +.endm + +.macro qq_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q + wrap_qX_montgomery \c0, \c1, \c2, \c3, \l0, \l1, \l2, \l3, \h0, \h1, \h2, \h3, \t0, \t1, \t2, \t3, \Qprime, \Q, .2S, .4S, .2D +.endm + +.macro qq_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3 + wrap_qX_add_sub \s0, \s1, \s2, \s3, \t0, \t1, \t2, \t3, \a0, \a1, \a2, \a3, \b0, \b1, \b2, \b3, .4S +.endm + +#endif + diff --git a/src/kem/saber/pqclean_saber_aarch64/macros_common.i b/src/kem/saber/pqclean_saber_aarch64/macros_common.i new file mode 100644 index 0000000000..26e7cbb5da --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/macros_common.i @@ -0,0 +1,434 @@ + +#ifndef MACROS_COMMON +#define MACROS_COMMON + +// for ABI + +.macro push_all + + sub sp, sp, #(16*9) + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, 
[sp, #16*4] + stp d8, d9, [sp, #16*5] + stp d10, d11, [sp, #16*6] + stp d12, d13, [sp, #16*7] + stp d14, d15, [sp, #16*8] + +.endm + +.macro pop_all + + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp d10, d11, [sp, #16*6] + ldp d12, d13, [sp, #16*7] + ldp d14, d15, [sp, #16*8] + add sp, sp, #(16*9) + +.endm + +// vector-scalar butterflies + +.macro wrap_dX_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + sub \b2\wX, \a2\wX, \t2\wX + sub \b3\wX, \a3\wX, \t3\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + add \a2\wX, \a2\wX, \t2\wX + add \a3\wX, \a3\wX, \t3\wX + +.endm + +.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, 
\b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b4\wX, \a4\wX, \t4\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b5\wX, \a5\wX, \t5\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b6\wX, \a6\wX, \t6\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + sub \b7\wX, \a7\wX, \t7\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a4\wX, \a4\wX, \t4\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a5\wX, \a5\wX, \t5\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a6\wX, \a6\wX, \t6\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + add \a7\wX, \a7\wX, \t7\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +// vector-vector butterflies + +.macro wrap_dX_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + mul \t1\wX, \b1\wX, \h1\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +// vector-scalar Barrett reduction + +.macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + + srshr \t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + + srshr 
\t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// vector-vector Barrett reduction + +.macro wrap_qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + + srshr \t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\wX + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\wX + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\wX + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// Montgomery multiplication + +.macro wrap_qX_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \b0\wX, \t0\wX, \z0\nX[\h0] + mul \b1\wX, \t1\wX, \z1\nX[\h1] + mul \b2\wX, \t2\wX, \z2\nX[\h2] + mul \b3\wX, \t3\wX, \z3\nX[\h3] + + sqrdmulh \t0\wX, \t0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \t1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \t2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \t3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +// Montgomery reduction with long + +.macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX + + uzp1 \t0\wX, \l0\wX, \h0\wX + uzp1 \t1\wX, \l1\wX, \h1\wX + uzp1 \t2\wX, \l2\wX, \h2\wX + uzp1 \t3\wX, \l3\wX, \h3\wX + + mul \t0\wX, \t0\wX, \Qprime\wX + mul \t1\wX, \t1\wX, \Qprime\wX + mul \t2\wX, \t2\wX, \Qprime\wX + mul \t3\wX, \t3\wX, \Qprime\wX + + smlal \l0\dwX, \t0\lX, \Q\lX + smlal2 \h0\dwX, \t0\wX, \Q\wX + smlal \l1\dwX, \t1\lX, \Q\lX + smlal2 \h1\dwX, \t1\wX, \Q\wX + smlal \l2\dwX, \t2\lX, \Q\lX + smlal2 \h2\dwX, \t2\wX, \Q\wX + smlal \l3\dwX, \t3\lX, \Q\lX + smlal2 \h3\dwX, \t3\wX, \Q\wX + + uzp2 \c0\wX, \l0\wX, \h0\wX + uzp2 \c1\wX, \l1\wX, \h1\wX + uzp2 \c2\wX, \l2\wX, \h2\wX + uzp2 \c3\wX, \l3\wX, \h3\wX + 
+.endm + +// add_sub, sub_add + +.macro wrap_qX_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + add \s0\wX, \a0\wX, \b0\wX + sub \t0\wX, \a0\wX, \b0\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s3\wX, \a3\wX, \b3\wX + sub \t3\wX, \a3\wX, \b3\wX + +.endm + +.macro wrap_qX_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + sub \t0\wX, \a0\wX, \b0\wX + add \s0\wX, \a0\wX, \b0\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t3\wX, \a3\wX, \b3\wX + add \s3\wX, \a3\wX, \b3\wX + +.endm + + +#endif + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/macros_common.inc b/src/kem/saber/pqclean_saber_aarch64/macros_common.inc new file mode 100644 index 0000000000..26e7cbb5da --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/macros_common.inc @@ -0,0 +1,434 @@ + +#ifndef MACROS_COMMON +#define MACROS_COMMON + +// for ABI + +.macro push_all + + sub sp, sp, #(16*9) + stp x19, x20, [sp, #16*0] + stp x21, x22, [sp, #16*1] + stp x23, x24, [sp, #16*2] + stp x25, x26, [sp, #16*3] + stp x27, x28, [sp, #16*4] + stp d8, d9, [sp, #16*5] + stp d10, d11, [sp, #16*6] + stp d12, d13, [sp, #16*7] + stp d14, d15, [sp, #16*8] + +.endm + +.macro pop_all + + ldp x19, x20, [sp, #16*0] + ldp x21, x22, [sp, #16*1] + ldp x23, x24, [sp, #16*2] + ldp x25, x26, [sp, #16*3] + ldp x27, x28, [sp, #16*4] + ldp d8, d9, [sp, #16*5] + ldp d10, d11, [sp, #16*6] + ldp d12, d13, [sp, #16*7] + ldp d14, d15, [sp, #16*8] + add sp, sp, #(16*9) + +.endm + +// vector-scalar butterflies + +.macro wrap_dX_butterfly_top a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_bot a0, a1, b0, b1, t0, t1, mod, z0, l0, h0, z1, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_top a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + mul \t1\wX, \b1\wX, \z1\nX[\h1] + mul \t2\wX, \b2\wX, \z2\nX[\h2] + mul \t3\wX, \b3\wX, \z3\nX[\h3] + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + sqrdmulh \b1\wX, \b1\wX, 
\z1\nX[\l1] + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_bot a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + sub \b2\wX, \a2\wX, \t2\wX + sub \b3\wX, \a3\wX, \t3\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + add \a2\wX, \a2\wX, \t2\wX + add \a3\wX, \a3\wX, \t3\wX + +.endm + +.macro wrap_qX_butterfly_mixed a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t4\wX, \b4\wX, \z4\nX[\h4] + sub \b1\wX, \a1\wX, \t1\wX + mul \t5\wX, \b5\wX, \z5\nX[\h5] + sub \b2\wX, \a2\wX, \t2\wX + mul \t6\wX, \b6\wX, \z6\nX[\h6] + sub \b3\wX, \a3\wX, \t3\wX + mul \t7\wX, \b7\wX, \z7\nX[\h7] + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b4\wX, \b4\wX, \z4\nX[\l4] + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b5\wX, \b5\wX, \z5\nX[\l5] + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b6\wX, \b6\wX, \z6\nX[\l6] + add \a3\wX, \a3\wX, \t3\wX + sqrdmulh \b7\wX, \b7\wX, \z7\nX[\l7] + + mls \t4\wX, \b4\wX, \mod\nX[0] + mls \t5\wX, \b5\wX, \mod\nX[0] + mls \t6\wX, \b6\wX, \mod\nX[0] + mls \t7\wX, \b7\wX, \mod\nX[0] + +.endm + +.macro wrap_qX_butterfly_mixed_rev a0, a1, a2, a3, b0, b1, b2, b3, t0, t1, t2, t3, a4, a5, a6, a7, b4, b5, b6, b7, t4, t5, t6, t7, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, z4, l4, h4, z5, l5, h5, z6, l6, h6, z7, l7, h7, wX, nX + + mul \t0\wX, \b0\wX, \z0\nX[\h0] + sub \b4\wX, \a4\wX, \t4\wX + mul \t1\wX, \b1\wX, \z1\nX[\h1] + sub \b5\wX, \a5\wX, \t5\wX + mul \t2\wX, \b2\wX, \z2\nX[\h2] + sub \b6\wX, \a6\wX, \t6\wX + mul \t3\wX, \b3\wX, \z3\nX[\h3] + sub \b7\wX, \a7\wX, \t7\wX + + sqrdmulh \b0\wX, \b0\wX, \z0\nX[\l0] + add \a4\wX, \a4\wX, \t4\wX + sqrdmulh \b1\wX, \b1\wX, \z1\nX[\l1] + add \a5\wX, \a5\wX, \t5\wX + sqrdmulh \b2\wX, \b2\wX, \z2\nX[\l2] + add \a6\wX, \a6\wX, \t6\wX + sqrdmulh \b3\wX, \b3\wX, \z3\nX[\l3] + add \a7\wX, \a7\wX, \t7\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +// vector-vector butterflies + +.macro wrap_dX_butterfly_vec_top a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + mul \t1\wX, \b1\wX, \h1\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_bot a0, a1, b0, b1, t0, t1, mod, l0, h0, l1, h1, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + sub \b1\wX, \a1\wX, \t1\wX + + add \a0\wX, \a0\wX, \t0\wX + add \a1\wX, \a1\wX, \t1\wX + +.endm + +.macro wrap_dX_butterfly_vec_mixed a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, h2, l3, h3, wX, nX + + sub \b0\wX, \a0\wX, \t0\wX + mul \t2\wX, \b2\wX, \h2\wX + sub \b1\wX, \a1\wX, \t1\wX + mul \t3\wX, \b3\wX, \h3\wX + + add \a0\wX, \a0\wX, \t0\wX + sqrdmulh \b2\wX, \b2\wX, \l2\wX + add \a1\wX, \a1\wX, \t1\wX + sqrdmulh \b3\wX, \b3\wX, \l3\wX + + mls \t2\wX, \b2\wX, \mod\nX[0] + mls \t3\wX, \b3\wX, \mod\nX[0] + +.endm + +.macro wrap_dX_butterfly_vec_mixed_rev a0, a1, b0, b1, t0, t1, a2, a3, b2, b3, t2, t3, mod, l0, h0, l1, h1, l2, 
h2, l3, h3, wX, nX + + mul \t0\wX, \b0\wX, \h0\wX + sub \b2\wX, \a2\wX, \t2\wX + mul \t1\wX, \b1\wX, \h1\wX + sub \b3\wX, \a3\wX, \t3\wX + + sqrdmulh \b0\wX, \b0\wX, \l0\wX + add \a2\wX, \a2\wX, \t2\wX + sqrdmulh \b1\wX, \b1\wX, \l1\wX + add \a3\wX, \a3\wX, \t3\wX + + mls \t0\wX, \b0\wX, \mod\nX[0] + mls \t1\wX, \b1\wX, \mod\nX[0] + +.endm + +// vector-scalar Barrett reduction + +.macro wrap_qX_barrett a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + + srshr \t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oX_barrett a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\nX[0] + sqdmulh \t1\wX, \a1\wX, \barrett_const\nX[0] + sqdmulh \t2\wX, \a2\wX, \barrett_const\nX[0] + sqdmulh \t3\wX, \a3\wX, \barrett_const\nX[0] + + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\nX[0] + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\nX[0] + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\nX[0] + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\nX[0] + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// vector-vector Barrett reduction + +.macro wrap_qo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + + srshr \t2\wX, \t2\wX, \shrv + mls \a0\wX, \t0\wX, \Q\wX + srshr \t3\wX, \t3\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + + mls \a2\wX, \t2\wX, \Q\wX + mls \a3\wX, \t3\wX, \Q\wX + +.endm + +.macro wrap_oo_barrett_vec a0, a1, a2, a3, t0, t1, t2, t3, a4, a5, a6, a7, t4, t5, t6, t7, barrett_const, shrv, Q, wX, nX + + sqdmulh \t0\wX, \a0\wX, \barrett_const\wX + sqdmulh \t1\wX, \a1\wX, \barrett_const\wX + sqdmulh \t2\wX, \a2\wX, \barrett_const\wX + sqdmulh \t3\wX, \a3\wX, \barrett_const\wX + + srshr \t0\wX, \t0\wX, \shrv + sqdmulh \t4\wX, \a4\wX, \barrett_const\wX + srshr \t1\wX, \t1\wX, \shrv + sqdmulh \t5\wX, \a5\wX, \barrett_const\wX + srshr \t2\wX, \t2\wX, \shrv + sqdmulh \t6\wX, \a6\wX, \barrett_const\wX + srshr \t3\wX, \t3\wX, \shrv + sqdmulh \t7\wX, \a7\wX, \barrett_const\wX + + mls \a0\wX, \t0\wX, \Q\wX + srshr \t4\wX, \t4\wX, \shrv + mls \a1\wX, \t1\wX, \Q\wX + srshr \t5\wX, \t5\wX, \shrv + mls \a2\wX, \t2\wX, \Q\wX + srshr \t6\wX, \t6\wX, \shrv + mls \a3\wX, \t3\wX, \Q\wX + srshr \t7\wX, \t7\wX, \shrv + + mls \a4\wX, \t4\wX, \Q\wX + mls \a5\wX, \t5\wX, \Q\wX + mls \a6\wX, \t6\wX, \Q\wX + mls \a7\wX, \t7\wX, \Q\wX + +.endm + +// Montgomery multiplication + +.macro wrap_qX_montgomery_mul b0, b1, b2, b3, t0, t1, t2, t3, mod, z0, l0, h0, z1, l1, h1, z2, l2, h2, z3, l3, h3, wX, nX 
+ + mul \b0\wX, \t0\wX, \z0\nX[\h0] + mul \b1\wX, \t1\wX, \z1\nX[\h1] + mul \b2\wX, \t2\wX, \z2\nX[\h2] + mul \b3\wX, \t3\wX, \z3\nX[\h3] + + sqrdmulh \t0\wX, \t0\wX, \z0\nX[\l0] + sqrdmulh \t1\wX, \t1\wX, \z1\nX[\l1] + sqrdmulh \t2\wX, \t2\wX, \z2\nX[\l2] + sqrdmulh \t3\wX, \t3\wX, \z3\nX[\l3] + + mls \b0\wX, \t0\wX, \mod\nX[0] + mls \b1\wX, \t1\wX, \mod\nX[0] + mls \b2\wX, \t2\wX, \mod\nX[0] + mls \b3\wX, \t3\wX, \mod\nX[0] + +.endm + +// Montgomery reduction with long + +.macro wrap_qX_montgomery c0, c1, c2, c3, l0, l1, l2, l3, h0, h1, h2, h3, t0, t1, t2, t3, Qprime, Q, lX, wX, dwX + + uzp1 \t0\wX, \l0\wX, \h0\wX + uzp1 \t1\wX, \l1\wX, \h1\wX + uzp1 \t2\wX, \l2\wX, \h2\wX + uzp1 \t3\wX, \l3\wX, \h3\wX + + mul \t0\wX, \t0\wX, \Qprime\wX + mul \t1\wX, \t1\wX, \Qprime\wX + mul \t2\wX, \t2\wX, \Qprime\wX + mul \t3\wX, \t3\wX, \Qprime\wX + + smlal \l0\dwX, \t0\lX, \Q\lX + smlal2 \h0\dwX, \t0\wX, \Q\wX + smlal \l1\dwX, \t1\lX, \Q\lX + smlal2 \h1\dwX, \t1\wX, \Q\wX + smlal \l2\dwX, \t2\lX, \Q\lX + smlal2 \h2\dwX, \t2\wX, \Q\wX + smlal \l3\dwX, \t3\lX, \Q\lX + smlal2 \h3\dwX, \t3\wX, \Q\wX + + uzp2 \c0\wX, \l0\wX, \h0\wX + uzp2 \c1\wX, \l1\wX, \h1\wX + uzp2 \c2\wX, \l2\wX, \h2\wX + uzp2 \c3\wX, \l3\wX, \h3\wX + +.endm + +// add_sub, sub_add + +.macro wrap_qX_add_sub s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + add \s0\wX, \a0\wX, \b0\wX + sub \t0\wX, \a0\wX, \b0\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s3\wX, \a3\wX, \b3\wX + sub \t3\wX, \a3\wX, \b3\wX + +.endm + +.macro wrap_qX_sub_add s0, s1, s2, s3, t0, t1, t2, t3, a0, a1, a2, a3, b0, b1, b2, b3, wX + + sub \t0\wX, \a0\wX, \b0\wX + add \s0\wX, \a0\wX, \b0\wX + sub \t1\wX, \a1\wX, \b1\wX + add \s1\wX, \a1\wX, \b1\wX + sub \t2\wX, \a2\wX, \b2\wX + add \s2\wX, \a2\wX, \b2\wX + sub \t3\wX, \a3\wX, \b3\wX + add \s3\wX, \a3\wX, \b3\wX + +.endm + + +#endif + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/pack_unpack.c b/src/kem/saber/pqclean_saber_aarch64/pack_unpack.c new file mode 100644 index 0000000000..bd05156167 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/pack_unpack.c @@ -0,0 +1,179 @@ +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. 
+=============================================================================*/ + + +#include "api.h" + #include "pack_unpack.h" + #include <stdint.h> + +/* This function reduces its input mod T */ +void POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { + size_t j; + const uint16_t *in = data; + uint8_t *out = bytes; + for (j = 0; j < SABER_N / 2; j++) { + out[0] = (uint8_t) ((in[0] & 0x0f) | (in[1] << 4)); + in += 2; + out += 1; + } +} + +/* This function does NOT reduce its output mod T */ +void BS2POLT(const uint8_t bytes[SABER_SCALEBYTES_KEM], uint16_t data[SABER_N]) { + PQCLEAN_SABER_AARCH64_asm_4_to_16(&(data[0]), &(bytes[0])); +} + +/* This function reduces its input mod q */ +void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { + size_t i; + const uint16_t *in = data; + uint8_t *out = bytes; + for (i = 0; i < SABER_N / 8; i++) { + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); + in += 8; + out += 13; + } +} + +/* This function sign-extends its output from q-bit to 16-bit. +This is needed by 16-bit NTTs */ +void BS2POLq(const uint8_t bytes[SABER_POLYBYTES], uint16_t data[SABER_N]) { + size_t i; + const uint8_t *in = bytes; + int16_t *out = (int16_t *)data; + + struct int13_t { // bitfield struct to sign-extend q-bit to 16-bit. +signed int bits: + SABER_EQ; + } q0, q1, q2, q3, q4, q5, q6, q7; + + for (i = 0; i < SABER_N / 8; i++) { + q0.bits = (in[0]) | (in[1] << 8); + q1.bits = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + q2.bits = (in[3] >> 2) | (in[4] << 6); + q3.bits = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + q4.bits = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + q5.bits = (in[8] >> 1) | (in[9] << 7); + q6.bits = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + q7.bits = (in[11] >> 3) | (in[12] << 5); + out[0] = (int16_t)q0.bits; + out[1] = (int16_t)q1.bits; + out[2] = (int16_t)q2.bits; + out[3] = (int16_t)q3.bits; + out[4] = (int16_t)q4.bits; + out[5] = (int16_t)q5.bits; + out[6] = (int16_t)q6.bits; + out[7] = (int16_t)q7.bits; + in += 13; + out += 8; + } +} + +/* This function reduces its input mod p */ +void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { + size_t i; + const uint16_t *in = data; + uint8_t *out = bytes; + for (i = 0; i < SABER_N / 4; i++) { + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); + in += 4; + out += 5; + } +} + +/* This function sign-extends its output from p-bit to 16-bit. +This is needed by the NTT */ +void BS2POLp(const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], uint16_t data[SABER_N]) { + size_t j; + const uint8_t *in = bytes; + int16_t *out = (int16_t *)data; + + struct int10_t { // bitfield struct to sign-extend p-bit to 16-bit.
+signed int bits: + SABER_EP; + } p0, p1, p2, p3; + + for (j = 0; j < SABER_N / 4; j++) { + p0.bits = (in[0]) | (in[1] << 8); + p1.bits = (in[1] >> 2) | (in[2] << 6); + p2.bits = (in[2] >> 4) | (in[3] << 4); + p3.bits = (in[3] >> 6) | (in[4] << 2); + out[0] = (int16_t)p0.bits; + out[1] = (int16_t)p1.bits; + out[2] = (int16_t)p2.bits; + out[3] = (int16_t)p3.bits; + in += 5; + out += 4; + } +} + +void POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void BS2POLVECq(const uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void BS2POLVECp(const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void BS2POLmsg(const uint8_t bytes[SABER_KEYBYTES], uint16_t data[SABER_N]) { + PQCLEAN_SABER_AARCH64_asm_1_to_16(&(data[0]), &(bytes[0])); +} + +/* This function reduces its input mod 2 */ +void POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { + size_t i, j; + uint8_t byte; + for (j = 0; j < SABER_KEYBYTES; j++) { + byte = 0; + for (i = 0; i < 8; i++) { + byte |= ((data[j * 8 + i] & 0x01) << i); + } + bytes[j] = byte; + } +} + + + + + diff --git a/src/kem/saber/pqclean_saber_aarch64/pack_unpack.h b/src/kem/saber/pqclean_saber_aarch64/pack_unpack.h new file mode 100644 index 0000000000..9880718639 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/pack_unpack.h @@ -0,0 +1,52 @@ +#ifndef PACK_UNPACK_H +#define PACK_UNPACK_H +/*============================================================================= +This file has been adapted from the implementation +(available at, Public Domain https://github.com/KULeuven-COSIC/SABER) +of "Saber: Module-LWR based key exchange, CPA-secure encryption and CCA-secure KEM" +by : Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, and Frederik Vercauteren +Jose Maria Bermudo Mera, Michiel Van Beirendonck, Andrea Basso. 
+=============================================================================*/ + +#include "SABER_params.h" + #include <stdint.h> + #include <stdio.h> + +extern void PQCLEAN_SABER_AARCH64_asm_1_to_16(void *, const void *); +extern void PQCLEAN_SABER_AARCH64_asm_4_to_16(void *, const void *); + +extern void PQCLEAN_SABER_AARCH64_asm_10_to_32(void *, const void *); +extern void PQCLEAN_SABER_AARCH64_asm_13_to_32(void *, const void *); +extern void PQCLEAN_SABER_AARCH64_asm_16_to_32(void *, const void *); + +#define POLT2BS SABER_NAMESPACE(POLT2BS) +void POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); +#define BS2POLT SABER_NAMESPACE(BS2POLT) +void BS2POLT(const uint8_t bytes[SABER_SCALEBYTES_KEM], uint16_t data[SABER_N]); + +#define POLq2BS SABER_NAMESPACE(POLq2BS) +void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]); +#define POLp2BS SABER_NAMESPACE(POLp2BS) +void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]); + +#define BS2POLq SABER_NAMESPACE(BS2POLq) +void BS2POLq(const uint8_t bytes[SABER_POLYBYTES], uint16_t data[SABER_N]); +#define BS2POLp SABER_NAMESPACE(BS2POLp) +void BS2POLp(const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], uint16_t data[SABER_N]); + +#define POLVECq2BS SABER_NAMESPACE(POLVECq2BS) +void POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]); +#define POLVECp2BS SABER_NAMESPACE(POLVECp2BS) +void POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]); + +#define BS2POLVECq SABER_NAMESPACE(BS2POLVECq) +void BS2POLVECq(const uint8_t bytes[SABER_POLYVECBYTES], uint16_t data[SABER_L][SABER_N]); +#define BS2POLVECp SABER_NAMESPACE(BS2POLVECp) +void BS2POLVECp(const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], uint16_t data[SABER_L][SABER_N]); + +#define BS2POLmsg SABER_NAMESPACE(BS2POLmsg) +void BS2POLmsg(const uint8_t bytes[SABER_KEYBYTES], uint16_t data[SABER_N]); +#define POLmsg2BS SABER_NAMESPACE(POLmsg2BS) +void POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); + +#endif diff --git a/src/kem/saber/pqclean_saber_aarch64/verify.c b/src/kem/saber/pqclean_saber_aarch64/verify.c new file mode 100644 index 0000000000..87a7acc486 --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/verify.c @@ -0,0 +1,34 @@ +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M.
Schanck, Peter Schwabe & Damien Stehlé +----------------------------------------------------*/ +#include "verify.h" + #include <stddef.h> + #include <stdint.h> + +/* returns 0 for equal strings, 1 for non-equal strings */ +int verify(const unsigned char *a, const unsigned char *b, size_t len) { + uint64_t r; + size_t i; + r = 0; + + for (i = 0; i < len; i++) { + r |= a[i] ^ b[i]; + } + + r = (-r) >> 63; + return r; +} + +/* b = 1 means mov, b = 0 means don't mov*/ +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { + size_t i; + + b = -b; + for (i = 0; i < len; i++) { + r[i] ^= b & (x[i] ^ r[i]); + } +} diff --git a/src/kem/saber/pqclean_saber_aarch64/verify.h b/src/kem/saber/pqclean_saber_aarch64/verify.h new file mode 100644 index 0000000000..2a3aabe77d --- /dev/null +++ b/src/kem/saber/pqclean_saber_aarch64/verify.h @@ -0,0 +1,21 @@ +#ifndef VERIFY_H +#define VERIFY_H +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehlé +----------------------------------------------------*/ +#include "SABER_params.h" +#include <stddef.h> + +/* returns 0 for equal strings, 1 for non-equal strings */ +#define verify SABER_NAMESPACE(verify) +int verify(const unsigned char *a, const unsigned char *b, size_t len); + +/* b = 1 means mov, b = 0 means don't mov*/ +#define cmov SABER_NAMESPACE(cmov) +void cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); + +#endif diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake index 3d225b4c02..e84e2a7f15 100644 --- a/src/oqsconfig.h.cmake +++ b/src/oqsconfig.h.cmake @@ -171,10 +171,13 @@ #cmakedefine OQS_ENABLE_KEM_SABER 1 #cmakedefine OQS_ENABLE_KEM_saber_lightsaber 1 #cmakedefine OQS_ENABLE_KEM_saber_lightsaber_avx2 1 +#cmakedefine OQS_ENABLE_KEM_saber_lightsaber_aarch64 1 #cmakedefine OQS_ENABLE_KEM_saber_saber 1 #cmakedefine OQS_ENABLE_KEM_saber_saber_avx2 1 +#cmakedefine OQS_ENABLE_KEM_saber_saber_aarch64 1 #cmakedefine OQS_ENABLE_KEM_saber_firesaber 1 #cmakedefine OQS_ENABLE_KEM_saber_firesaber_avx2 1 +#cmakedefine OQS_ENABLE_KEM_saber_firesaber_aarch64 1 #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1
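A few notes on the code added above, starting with the butterfly macros. Every wrap_*_butterfly_* variant expands to the same three-instruction modular multiplication: mul takes the low half of b*z, sqrdmulh takes the rounded high half against a precomputed scaled copy of the twiddle, and mls subtracts the estimated quotient times the modulus. The macros are generic over lane width; the Saber instantiations above pass .4S, i.e. 32-bit lanes. A rough scalar C sketch of the triple, under the assumption (not visible in this diff, since the twiddle tables live elsewhere) that each twiddle is stored as a pair (z_l, z_h) with z_l = round(z_h * 2^32 / (2*q)):

    #include <stdint.h>

    /* Scalar sketch of the mul/sqrdmulh/mls triple in wrap_dX/qX_butterfly_top.
       Assumption (not from the diff): z_l = round(z_h * 2^32 / (2*q)),
       so hi below estimates b*z_h / q. Unsigned casts model the wrapping
       32-bit lane arithmetic without signed-overflow UB. */
    static int32_t mulmod_twiddle(int32_t b, int32_t z_h, int32_t z_l, int32_t q) {
        int32_t t  = (int32_t)((uint32_t)b * (uint32_t)z_h);   /* mul: low 32 bits of b*z_h */
        int32_t hi = (int32_t)((2 * (int64_t)b * z_l
                                + ((int64_t)1 << 31)) >> 32);  /* sqrdmulh: rounded high half */
        return (int32_t)((uint32_t)t - (uint32_t)hi * (uint32_t)q); /* mls: t - hi*q mod 2^32 */
    }

The result is congruent to b*z_h mod q and stays within roughly ±q/2, so the _bot halves can form a ± t directly: a standard Cooley-Tukey butterfly, with the multiply and add/sub stages split so the _mixed variants can interleave independent instructions.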
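The wrap_qX_barrett and wrap_oX_barrett macros are the matching Barrett reduction: sqdmulh produces a doubled (truncated) high half, srshr rounds it down to the quotient, and mls subtracts quotient times Q. A scalar sketch, assuming barrett_const = round(2^(31+shrv) / q); the actual constant is loaded by callers outside this diff:

    #include <stdint.h>

    /* Scalar sketch of the sqdmulh/srshr/mls chain in wrap_qX_barrett. */
    static int32_t barrett_reduce(int32_t a, int32_t c, int shrv, int32_t q) {
        int32_t t = (int32_t)((2 * (int64_t)a * c) >> 32);   /* sqdmulh: doubled high half  */
        t = (t + (1 << (shrv - 1))) >> shrv;                 /* srshr: rounding right shift */
        return (int32_t)((uint32_t)a - (uint32_t)t * (uint32_t)q); /* mls: a - round(a/q)*q */
    }

The interleaving inside the macros (an srshr on one register between sqdmulh on the next) only hides instruction latency; it does not change the arithmetic.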
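wrap_qX_montgomery is a 32-bit Montgomery reduction applied to 64-bit accumulators: uzp1 gathers the low 32-bit words of the products, mul multiplies them by Qprime, smlal/smlal2 add t*Q back onto the 64-bit lanes (forcing the low words to zero), and uzp2 extracts the high words. In scalar form, assuming Qprime = -q^(-1) mod 2^32 (the sign convention depends on the constant tables, which this diff does not contain):

    #include <stdint.h>

    /* Scalar sketch of wrap_qX_montgomery: maps v to v * 2^-32 mod q.
       Assumption: qprime = -q^-1 mod 2^32. */
    static int32_t montgomery_reduce(int64_t v, int32_t q, int32_t qprime) {
        int32_t t = (int32_t)((uint32_t)v * (uint32_t)qprime); /* uzp1 + mul: low word * qprime */
        return (int32_t)((v + (int64_t)t * q) >> 32);          /* smlal + uzp2: exact high word */
    }

Because t*q is congruent to -v mod 2^32, the sum's low 32 bits vanish and the shift loses nothing.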
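In pack_unpack.c, BS2POLq and BS2POLp avoid an explicit mask-and-subtract by assigning the packed 13-bit (or 10-bit) value to a signed bitfield and letting the read sign-extend it. A minimal standalone illustration of the same trick:

    #include <stdint.h>
    #include <stdio.h>

    /* Reading a signed bitfield sign-extends it, exactly as in BS2POLq.
       Overflowing assignment to a bitfield is implementation-defined in
       ISO C, but wraps as intended on the GCC/Clang targets used here. */
    struct int13_t { signed int bits: 13; };

    int main(void) {
        struct int13_t q;
        q.bits = 0x1FFF;                     /* all 13 bits set */
        printf("%d\n", (int16_t)q.bits);     /* prints -1       */
        q.bits = 0x0FFF;                     /* sign bit clear  */
        printf("%d\n", (int16_t)q.bits);     /* prints 4095     */
        return 0;
    }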
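verify() and cmov() are the usual constant-time helpers for the Fujisaki-Okamoto transform: verify compares the re-encrypted ciphertext against the received one without an early exit, and cmov conditionally overwrites key material with the secret rejection value. A hedged usage sketch follows; the function and buffer names are illustrative stand-ins, since the kem.c that actually calls these is not part of this excerpt:

    #include <stddef.h>
    #include "verify.h"

    /* Illustrative only: constant-time accept/reject in a CCA decapsulation. */
    static void reject_on_mismatch(unsigned char *kr, const unsigned char *z,
                                   const unsigned char *c, const unsigned char *cmp,
                                   size_t clen, size_t keylen) {
        unsigned char fail = (unsigned char)verify(c, cmp, clen); /* 1 iff any byte differs */
        cmov(kr, z, keylen, fail);  /* overwrite iff fail == 1, branch-free */
    }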
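Finally, the oqsconfig.h.cmake hunk mirrors the new CMake options into liboqs's generated configuration header, so C code can gate the NEON path at compile time. A sketch of the pattern a dispatch file would use (the actual SABER dispatch code is not shown in this diff, and the include path is the installed-header convention, not something this patch defines):

    #include <oqs/oqsconfig.h>

    #if defined(OQS_ENABLE_KEM_saber_saber_aarch64)
    /* aarch64/NEON implementation was compiled in; route calls to it. */
    #else
    /* fall back to the portable C implementation. */
    #endif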