aws · dkostic · Apr 30, 2024 · Apr 23, 2024 · Apr 24, 2024 · Apr 24, 2024
@@ -20,66 +20,20 @@
 S2N_BIGNUM_STUB_FUNC(void, bignum_mod_n25519, uint64_t z[4], uint64_t k, uint64_t *x)
 S2N_BIGNUM_STUB_FUNC(void, bignum_neg_p25519, uint64_t z[4], uint64_t x[4])
 S2N_BIGNUM_STUB_FUNC(void, bignum_madd_n25519, uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4])
-S2N_BIGNUM_STUB_FUNC(void, bignum_madd_n25519_alt, uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4])
+S2N_BIGNUM_STUB_FUNC(void, bignum_madd_n25519_selector, uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4])
 S2N_BIGNUM_STUB_FUNC(void, edwards25519_encode, uint8_t z[32], uint64_t p[8])
 S2N_BIGNUM_STUB_FUNC(uint64_t, edwards25519_decode, uint64_t z[8], const uint8_t c[32])
-S2N_BIGNUM_STUB_FUNC(uint64_t, edwards25519_decode_alt, uint64_t z[8], const uint8_t c[32])
+S2N_BIGNUM_STUB_FUNC(uint64_t, edwards25519_decode_selector, uint64_t z[8], const uint8_t c[32])
 S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmulbase, uint64_t res[8],uint64_t scalar[4])
-S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmulbase_alt, uint64_t res[8],uint64_t scalar[4])
+S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmulbase_selector, uint64_t res[8],uint64_t scalar[4])
 S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmuldouble, uint64_t res[8], uint64_t scalar[4], uint64_t point[8], uint64_t bscalar[4])
-S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmuldouble_alt, uint64_t res[8], uint64_t scalar[4], uint64_t point[8], uint64_t bscalar[4])
+S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmuldouble_selector, uint64_t res[8], uint64_t scalar[4], uint64_t point[8], uint64_t bscalar[4])
 S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519_byte, uint8_t res[32], const uint8_t scalar[32], const uint8_t point[32])
-S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519_byte_alt, uint8_t res[32], const uint8_t scalar[32], const uint8_t point[32])
+S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519_byte_selector, uint8_t res[32], const uint8_t scalar[32], const uint8_t point[32])
 S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519base_byte, uint8_t res[32], const uint8_t scalar[32])
-S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519base_byte_alt, uint8_t res[32], const uint8_t scalar[32])
+S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519base_byte_selector, uint8_t res[32], const uint8_t scalar[32])
 #endif // !defined(CURVE25519_S2N_BIGNUM_CAPABLE)
 
-// curve25519_s2n_bignum_use_no_alt_implementation returns 1 if the no_alt
-// s2n-bignum implementation should be used and 0 otherwise.
-//
-// Below is the decision logic for which assembly backend implementation
-// of x25519 s2n-bignum we should use if x25519 s2n-bignum capable. Currently,
-// we support the following implementations.
-//
-// x86_64:
-// - s2n-bignum-no-alt: hardware implementation using bmi2+adx instruction sets
-// - s2n-bignum-alt: hardware implementation using standard instructions
-//
-// aarch64:
-// - s2n-bignum-no-alt: hardware implementation for "low" multiplier throughput
-// - s2n-bignum-alt: hardware implementation for "high" multiplier throughput
-//
-// Through experiments we have found that:
-//
-// For x86_64: bmi2+adx will almost always give a performance boost. So, here we
-// prefer s2n-bignum-no-alt over s2n-bignum-alt if the former is supported.
-// For aarch64: if a wide multiplier is supported, we prefer s2n-bignum-alt over
-// s2n-bignum-no-alt if the former is supported.
-// |curve25519_s2n_bignum_alt_capable| specifically looks to match CPUs that
-// have wide multipliers. This ensures that s2n-bignum-alt will only be used
-// on such CPUs.
-OPENSSL_INLINE int curve25519_s2n_bignum_use_no_alt_implementation(void);
-OPENSSL_INLINE int curve25519_s2n_bignum_use_no_alt_implementation(void) {
-#if defined(OPENSSL_X86_64)
- // For x86_64 the no_alt implementation is bmi2+adx. Prefer if available. 
- if (CRYPTO_is_BMI2_capable() == 1 && CRYPTO_is_ADX_capable() == 1) {
- return 1;
- } else {
- return 0;
- }
-#elif defined(OPENSSL_AARCH64)
- // For aarch64 the alt implementation is for wide multipliers. Prefer if
- // available.
- if (CRYPTO_is_ARMv8_wide_multiplier_capable() == 1) {
- return 0;
- } else {
- return 1;
- }
-#endif
- // Have to return some default value.
- return 0;
-}
-
 void x25519_scalar_mult_generic_s2n_bignum(
  uint8_t out_shared_key[X25519_SHARED_KEY_LEN],
  const uint8_t private_key[X25519_PRIVATE_KEY_LEN],
@@ -91,13 +45,9 @@ void x25519_scalar_mult_generic_s2n_bignum(
  private_key_internal_demask[31] &= 127;
  private_key_internal_demask[31] |= 64;
 
- if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
- curve25519_x25519_byte(out_shared_key, private_key_internal_demask,
- peer_public_value);
- } else {
- curve25519_x25519_byte_alt(out_shared_key, private_key_internal_demask,
- peer_public_value);
- }
+ curve25519_x25519_byte_selector(out_shared_key,
+ private_key_internal_demask,
+ peer_public_value);
 }
 
 void x25519_public_from_private_s2n_bignum(
@@ -110,11 +60,7 @@ void x25519_public_from_private_s2n_bignum(
  private_key_internal_demask[31] &= 127;
  private_key_internal_demask[31] |= 64;
 
- if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
- curve25519_x25519base_byte(out_public_value, private_key_internal_demask);
- } else {
- curve25519_x25519base_byte_alt(out_public_value, private_key_internal_demask);
- }
+ curve25519_x25519base_byte_selector(out_public_value, private_key_internal_demask);
 }
 
 void ed25519_public_key_from_hashed_seed_s2n_bignum(
@@ -125,29 +71,14 @@ void ed25519_public_key_from_hashed_seed_s2n_bignum(
  uint64_t uint64_hashed_seed[4] = {0};
  OPENSSL_memcpy(uint64_hashed_seed, az, 32);
 
- if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
- edwards25519_scalarmulbase(uint64_point, uint64_hashed_seed);
- } else {
- edwards25519_scalarmulbase_alt(uint64_point, uint64_hashed_seed);
- }
+ edwards25519_scalarmulbase_selector(uint64_point, uint64_hashed_seed);
 
  edwards25519_encode(out_public_key, uint64_point);
 }
 
 void ed25519_sign_s2n_bignum(uint8_t out_sig[ED25519_SIGNATURE_LEN],
  uint8_t r[SHA512_DIGEST_LENGTH], const uint8_t *s, const uint8_t *A,
  const void *message, size_t message_len) {
-
- void (*scalarmulbase)(uint64_t res[8],uint64_t scalar[4]);
- void (*madd)(uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4]);
-
- if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
- scalarmulbase = edwards25519_scalarmulbase;
- madd = bignum_madd_n25519;
- } else {
- scalarmulbase = edwards25519_scalarmulbase_alt;
- madd = bignum_madd_n25519_alt;
- }
 
  uint8_t k[SHA512_DIGEST_LENGTH] = {0};
  uint64_t R[8] = {0};
@@ -162,7 +93,7 @@ void ed25519_sign_s2n_bignum(uint8_t out_sig[ED25519_SIGNATURE_LEN],
  bignum_mod_n25519(uint64_r, 8, uint64_r);
 
  // Compute [r]B.
- scalarmulbase(R, uint64_r);
+ edwards25519_scalarmulbase_selector(R, uint64_r);
  edwards25519_encode(out_sig, R);
 
  // Compute k = SHA512(R || A || message)
@@ -174,34 +105,22 @@ void ed25519_sign_s2n_bignum(uint8_t out_sig[ED25519_SIGNATURE_LEN],
 
  // Compute S = r + k * s modulo the order of the base-point B.
  // out_sig = R || S
- madd(S, uint64_k, uint64_s, uint64_r);
+ bignum_madd_n25519_selector(S, uint64_k, uint64_s, uint64_r);
  OPENSSL_memcpy(out_sig + 32, S, 32);
 }
 
 int ed25519_verify_s2n_bignum(uint8_t R_computed_encoded[32],
  const uint8_t public_key[ED25519_PUBLIC_KEY_LEN], uint8_t R_expected[32],
  uint8_t S[32], const uint8_t *message, size_t message_len) {
 
- void (*scalarmuldouble)(uint64_t res[8], uint64_t scalar[4],
- uint64_t point[8], uint64_t bscalar[4]);
- uint64_t (*decode)(uint64_t z[8], const uint8_t c[32]);
-
- if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
- scalarmuldouble = edwards25519_scalarmuldouble;
- decode = edwards25519_decode;
- } else {
- scalarmuldouble = edwards25519_scalarmuldouble_alt;
- decode = edwards25519_decode_alt;
- }
-
  uint8_t k[SHA512_DIGEST_LENGTH] = {0};
  uint64_t uint64_k[8] = {0};
  uint64_t uint64_R[8] = {0};
  uint64_t uint64_S[4] = {0};
  uint64_t A[8] = {0};
 
  // Decode public key as A'.
- if (decode(A, public_key) != 0) {
+ if (edwards25519_decode_selector(A, public_key) != 0) {
  return 0;
  }
 
@@ -222,7 +141,7 @@ int ed25519_verify_s2n_bignum(uint8_t R_computed_encoded[32],
 
  // Compute R_have <- [S]B - [k]A'.
  OPENSSL_memcpy(uint64_S, S, 32);
- scalarmuldouble(uint64_R, uint64_k, A, uint64_S);
+ edwards25519_scalarmuldouble_selector(uint64_R, uint64_k, A, uint64_S);
  edwards25519_encode(R_computed_encoded, uint64_R);
 
  return 1;

@@ -76,47 +76,15 @@ static const p384_felem p384_felem_one = {
 
 #if defined(P384_USE_S2N_BIGNUM_FIELD_ARITH)
 
-#if defined(OPENSSL_X86_64)
-// On x86_64 platforms s2n-bignum uses the bmi2 and adx instruction sets
-// for some of the functions. These instructions are not supported by
-// every x86 CPU, so we have to check whether they are available; if they
-// are not, we fall back to a slightly slower but generic implementation.
-static inline uint8_t p384_use_s2n_bignum_alt(void) {
- return (!CRYPTO_is_BMI2_capable() || !CRYPTO_is_ADX_capable());
-}
-#else
-// On aarch64 platforms s2n-bignum has two implementations of certain
-// functions -- the default one and the alternative (suffixed _alt).
-// Depending on the architecture one version is faster than the other.
-// Generally, the "_alt" functions are faster on architectures with higher
-// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips.
-static inline uint8_t p384_use_s2n_bignum_alt(void) {
- return CRYPTO_is_ARMv8_wide_multiplier_capable();
-}
-#endif
-
 #define p384_felem_add(out, in0, in1) bignum_add_p384(out, in0, in1)
 #define p384_felem_sub(out, in0, in1) bignum_sub_p384(out, in0, in1)
 #define p384_felem_opp(out, in0) bignum_neg_p384(out, in0)
 #define p384_felem_to_bytes(out, in0) bignum_tolebytes_6(out, in0)
 #define p384_felem_from_bytes(out, in0) bignum_fromlebytes_6(out, in0)
-
-// The following four functions need bmi2 and adx support.
-#define p384_felem_mul(out, in0, in1) \
- if (p384_use_s2n_bignum_alt()) bignum_montmul_p384_alt(out, in0, in1); \
- else bignum_montmul_p384(out, in0, in1);
-
-#define p384_felem_sqr(out, in0) \
- if (p384_use_s2n_bignum_alt()) bignum_montsqr_p384_alt(out, in0); \
- else bignum_montsqr_p384(out, in0);
-
-#define p384_felem_to_mont(out, in0) \
- if (p384_use_s2n_bignum_alt()) bignum_tomont_p384_alt(out, in0); \
- else bignum_tomont_p384(out, in0);
-
-#define p384_felem_from_mont(out, in0) \
- if (p384_use_s2n_bignum_alt()) bignum_deamont_p384_alt(out, in0); \
- else bignum_deamont_p384(out, in0);
+#define p384_felem_to_mont(out, in0) bignum_tomont_p384_selector(out, in0)
+#define p384_felem_from_mont(out, in0) bignum_deamont_p384_selector(out, in0)
+#define p384_felem_mul(out, in0, in1) bignum_montmul_p384_selector(out, in0, in1)
+#define p384_felem_sqr(out, in0) bignum_montsqr_p384_selector(out, in0)
 
 static p384_limb_t p384_felem_nz(const p384_limb_t in1[P384_NLIMBS]) {
  return bignum_nonzero_6(in1);

@@ -77,40 +77,14 @@ static const p521_limb_t p521_felem_p[P521_NLIMBS] = {
  0xffffffffffffffff, 0xffffffffffffffff,
  0x1ff};
 
-#if defined(OPENSSL_X86_64)
-// On x86_64 platforms s2n-bignum uses the bmi2 and adx instruction sets
-// for some of the functions. These instructions are not supported by
-// every x86 CPU, so we have to check whether they are available; if they
-// are not, we fall back to a slightly slower but generic implementation.
-static inline uint8_t p521_use_s2n_bignum_alt(void) {
- return (!CRYPTO_is_BMI2_capable() || !CRYPTO_is_ADX_capable());
-}
-#else
-// On aarch64 platforms s2n-bignum has two implementations of certain
-// functions -- the default one and the alternative (suffixed _alt).
-// Depending on the architecture one version is faster than the other.
-// Generally, the "_alt" functions are faster on architectures with higher
-// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips.
-static inline uint8_t p521_use_s2n_bignum_alt(void) {
- return CRYPTO_is_ARMv8_wide_multiplier_capable();
-}
-#endif
-
 // s2n-bignum implementation of field arithmetic
 #define p521_felem_add(out, in0, in1) bignum_add_p521(out, in0, in1)
 #define p521_felem_sub(out, in0, in1) bignum_sub_p521(out, in0, in1)
 #define p521_felem_opp(out, in0) bignum_neg_p521(out, in0)
 #define p521_felem_to_bytes(out, in0) bignum_tolebytes_p521(out, in0)
 #define p521_felem_from_bytes(out, in0) bignum_fromlebytes_p521(out, in0)
-
-// The following two functions need bmi2 and adx support.
-#define p521_felem_mul(out, in0, in1) \
- if (p521_use_s2n_bignum_alt()) bignum_mul_p521_alt(out, in0, in1); \
- else bignum_mul_p521(out, in0, in1);
-
-#define p521_felem_sqr(out, in0) \
- if (p521_use_s2n_bignum_alt()) bignum_sqr_p521_alt(out, in0); \
- else bignum_sqr_p521(out, in0);
+#define p521_felem_mul(out, in0, in1) bignum_mul_p521_selector(out, in0, in1)
+#define p521_felem_sqr(out, in0) bignum_sqr_p521_selector(out, in0)
 
 #else // P521_USE_S2N_BIGNUM_FIELD_ARITH