Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Centralize handling of s2n-bignum alt/non-alt function selection #1547

Merged
merged 7 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 15 additions & 96 deletions crypto/curve25519/curve25519_s2n_bignum_asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,66 +20,20 @@
S2N_BIGNUM_STUB_FUNC(void, bignum_mod_n25519, uint64_t z[4], uint64_t k, uint64_t *x)
S2N_BIGNUM_STUB_FUNC(void, bignum_neg_p25519, uint64_t z[4], uint64_t x[4])
S2N_BIGNUM_STUB_FUNC(void, bignum_madd_n25519, uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you still need these?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

guess not, removed

S2N_BIGNUM_STUB_FUNC(void, bignum_madd_n25519_alt, uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4])
S2N_BIGNUM_STUB_FUNC(void, bignum_madd_n25519_selector, uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4])
S2N_BIGNUM_STUB_FUNC(void, edwards25519_encode, uint8_t z[32], uint64_t p[8])
S2N_BIGNUM_STUB_FUNC(uint64_t, edwards25519_decode, uint64_t z[8], const uint8_t c[32])
S2N_BIGNUM_STUB_FUNC(uint64_t, edwards25519_decode_alt, uint64_t z[8], const uint8_t c[32])
S2N_BIGNUM_STUB_FUNC(uint64_t, edwards25519_decode_selector, uint64_t z[8], const uint8_t c[32])
S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmulbase, uint64_t res[8],uint64_t scalar[4])
S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmulbase_alt, uint64_t res[8],uint64_t scalar[4])
S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmulbase_selector, uint64_t res[8],uint64_t scalar[4])
S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmuldouble, uint64_t res[8], uint64_t scalar[4], uint64_t point[8], uint64_t bscalar[4])
S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmuldouble_alt, uint64_t res[8], uint64_t scalar[4], uint64_t point[8], uint64_t bscalar[4])
S2N_BIGNUM_STUB_FUNC(void, edwards25519_scalarmuldouble_selector, uint64_t res[8], uint64_t scalar[4], uint64_t point[8], uint64_t bscalar[4])
S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519_byte, uint8_t res[32], const uint8_t scalar[32], const uint8_t point[32])
S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519_byte_alt, uint8_t res[32], const uint8_t scalar[32], const uint8_t point[32])
S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519_byte_selector, uint8_t res[32], const uint8_t scalar[32], const uint8_t point[32])
S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519base_byte, uint8_t res[32], const uint8_t scalar[32])
S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519base_byte_alt, uint8_t res[32], const uint8_t scalar[32])
S2N_BIGNUM_STUB_FUNC(void, curve25519_x25519base_byte_selector, uint8_t res[32], const uint8_t scalar[32])
#endif // !defined(CURVE25519_S2N_BIGNUM_CAPABLE)

// curve25519_s2n_bignum_use_no_alt_implementation returns 1 if the no_alt
// s2n-bignum implementation should be used and 0 otherwise.
//
// Below is the decision logic for which assembly backend implementation
// of x25519 s2n-bignum we should use if x25519 s2n-bignum capable. Currently,
// we support the following implementations.
//
// x86_64:
// - s2n-bignum-no-alt: hardware implementation using bmi2+adx instruction sets
// - s2n-bignum-alt: hardware implementation using standard instructions
//
// aarch64:
// - s2n-bignum-no-alt: hardware implementation for "low" multiplier throughput
// - s2n-bignum-alt: hardware implementation for "high" multiplier throughput
//
// Through experiments we have found that:
//
// For x86_64: bmi2+adx will almost always give a performance boost. So, here we
// prefer s2n-bignum-no-alt over s2n-bignum-alt if the former is supported.
// For aarch64: if a wide multiplier is supported, we prefer s2n-bignum-alt over
// s2n-bignum-no-alt if the former is supported.
// |curve25519_s2n_bignum_alt_capable| specifically looks to match CPUs that
// have wide multipliers. This ensures that s2n-bignum-alt will only be used
// on such CPUs.
OPENSSL_INLINE int curve25519_s2n_bignum_use_no_alt_implementation(void);

// Returns 1 when the no_alt s2n-bignum backend should be called and 0 when
// the _alt backend should be called instead.
OPENSSL_INLINE int curve25519_s2n_bignum_use_no_alt_implementation(void) {
#if defined(OPENSSL_X86_64)
  // no_alt is the bmi2+adx code path on x86_64: choose it only when both
  // capability checks report 1.
  return CRYPTO_is_BMI2_capable() == 1 && CRYPTO_is_ADX_capable() == 1;
#elif defined(OPENSSL_AARCH64)
  // _alt targets wide multipliers on aarch64: choose no_alt unless the
  // wide-multiplier check reports 1.
  return CRYPTO_is_ARMv8_wide_multiplier_capable() != 1;
#else
  // Neither architecture macro is defined; a value still has to be returned.
  return 0;
#endif
}

void x25519_scalar_mult_generic_s2n_bignum(
uint8_t out_shared_key[X25519_SHARED_KEY_LEN],
const uint8_t private_key[X25519_PRIVATE_KEY_LEN],
Expand All @@ -91,13 +45,9 @@ void x25519_scalar_mult_generic_s2n_bignum(
private_key_internal_demask[31] &= 127;
private_key_internal_demask[31] |= 64;

if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
curve25519_x25519_byte(out_shared_key, private_key_internal_demask,
peer_public_value);
} else {
curve25519_x25519_byte_alt(out_shared_key, private_key_internal_demask,
peer_public_value);
}
curve25519_x25519_byte_selector(out_shared_key,
private_key_internal_demask,
peer_public_value);
}

void x25519_public_from_private_s2n_bignum(
Expand All @@ -110,11 +60,7 @@ void x25519_public_from_private_s2n_bignum(
private_key_internal_demask[31] &= 127;
private_key_internal_demask[31] |= 64;

if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
curve25519_x25519base_byte(out_public_value, private_key_internal_demask);
} else {
curve25519_x25519base_byte_alt(out_public_value, private_key_internal_demask);
}
curve25519_x25519base_byte_selector(out_public_value, private_key_internal_demask);
}

void ed25519_public_key_from_hashed_seed_s2n_bignum(
Expand All @@ -125,29 +71,14 @@ void ed25519_public_key_from_hashed_seed_s2n_bignum(
uint64_t uint64_hashed_seed[4] = {0};
OPENSSL_memcpy(uint64_hashed_seed, az, 32);

if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
edwards25519_scalarmulbase(uint64_point, uint64_hashed_seed);
} else {
edwards25519_scalarmulbase_alt(uint64_point, uint64_hashed_seed);
}
edwards25519_scalarmulbase_selector(uint64_point, uint64_hashed_seed);

edwards25519_encode(out_public_key, uint64_point);
}

void ed25519_sign_s2n_bignum(uint8_t out_sig[ED25519_SIGNATURE_LEN],
uint8_t r[SHA512_DIGEST_LENGTH], const uint8_t *s, const uint8_t *A,
const void *message, size_t message_len) {

void (*scalarmulbase)(uint64_t res[8],uint64_t scalar[4]);
void (*madd)(uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4]);

if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
scalarmulbase = edwards25519_scalarmulbase;
madd = bignum_madd_n25519;
} else {
scalarmulbase = edwards25519_scalarmulbase_alt;
madd = bignum_madd_n25519_alt;
}

uint8_t k[SHA512_DIGEST_LENGTH] = {0};
uint64_t R[8] = {0};
Expand All @@ -162,7 +93,7 @@ void ed25519_sign_s2n_bignum(uint8_t out_sig[ED25519_SIGNATURE_LEN],
bignum_mod_n25519(uint64_r, 8, uint64_r);

// Compute [r]B.
scalarmulbase(R, uint64_r);
edwards25519_scalarmulbase_selector(R, uint64_r);
edwards25519_encode(out_sig, R);

// Compute k = SHA512(R || A || message)
Expand All @@ -174,34 +105,22 @@ void ed25519_sign_s2n_bignum(uint8_t out_sig[ED25519_SIGNATURE_LEN],

// Compute S = r + k * s modulo the order of the base-point B.
// out_sig = R || S
madd(S, uint64_k, uint64_s, uint64_r);
bignum_madd_n25519_selector(S, uint64_k, uint64_s, uint64_r);
OPENSSL_memcpy(out_sig + 32, S, 32);
}

int ed25519_verify_s2n_bignum(uint8_t R_computed_encoded[32],
const uint8_t public_key[ED25519_PUBLIC_KEY_LEN], uint8_t R_expected[32],
uint8_t S[32], const uint8_t *message, size_t message_len) {

void (*scalarmuldouble)(uint64_t res[8], uint64_t scalar[4],
uint64_t point[8], uint64_t bscalar[4]);
uint64_t (*decode)(uint64_t z[8], const uint8_t c[32]);

if (curve25519_s2n_bignum_use_no_alt_implementation() == 1) {
scalarmuldouble = edwards25519_scalarmuldouble;
decode = edwards25519_decode;
} else {
scalarmuldouble = edwards25519_scalarmuldouble_alt;
decode = edwards25519_decode_alt;
}

uint8_t k[SHA512_DIGEST_LENGTH] = {0};
uint64_t uint64_k[8] = {0};
uint64_t uint64_R[8] = {0};
uint64_t uint64_S[4] = {0};
uint64_t A[8] = {0};

// Decode public key as A'.
if (decode(A, public_key) != 0) {
if (edwards25519_decode_selector(A, public_key) != 0) {
return 0;
}

Expand All @@ -222,7 +141,7 @@ int ed25519_verify_s2n_bignum(uint8_t R_computed_encoded[32],

// Compute R_have <- [S]B - [k]A'.
OPENSSL_memcpy(uint64_S, S, 32);
scalarmuldouble(uint64_R, uint64_k, A, uint64_S);
edwards25519_scalarmuldouble_selector(uint64_R, uint64_k, A, uint64_S);
edwards25519_encode(R_computed_encoded, uint64_R);

return 1;
Expand Down
40 changes: 4 additions & 36 deletions crypto/fipsmodule/ec/p384.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,47 +76,15 @@ static const p384_felem p384_felem_one = {

#if defined(P384_USE_S2N_BIGNUM_FIELD_ARITH)

#if defined(OPENSSL_X86_64)
// x86_64: some s2n-bignum functions are written with the bmi2 and adx
// instruction sets, which not every x86 CPU provides. This helper reports
// whether the slightly slower but generic "_alt" versions must be used:
// it returns 1 unless both capability checks succeed, and 0 otherwise.
static inline uint8_t p384_use_s2n_bignum_alt(void) {
  const int has_bmi2_and_adx =
      CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
  return !has_bmi2_and_adx;
}
#else
// On aarch64 platforms s2n-bignum has two implementations of certain
// functions -- the default one and the alternative (suffixed _alt).
// Depending on the architecture one version is faster than the other.
// Generally, the "_alt" functions are faster on architectures with higher
// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips.
//
// This definition is the #else branch of the OPENSSL_X86_64 check. It simply
// forwards the result of CRYPTO_is_ARMv8_wide_multiplier_capable(); the
// p384_felem_* macros treat a nonzero result as "call the _alt function".
static inline uint8_t p384_use_s2n_bignum_alt(void) {
return CRYPTO_is_ARMv8_wide_multiplier_capable();
}
#endif

#define p384_felem_add(out, in0, in1) bignum_add_p384(out, in0, in1)
#define p384_felem_sub(out, in0, in1) bignum_sub_p384(out, in0, in1)
#define p384_felem_opp(out, in0) bignum_neg_p384(out, in0)
#define p384_felem_to_bytes(out, in0) bignum_tolebytes_6(out, in0)
#define p384_felem_from_bytes(out, in0) bignum_fromlebytes_6(out, in0)

// The following four operations exist in two s2n-bignum variants. Each macro
// asks p384_use_s2n_bignum_alt() at run time and calls the "_alt" function
// when it returns nonzero, otherwise the default function (the default
// variants are the ones that need bmi2 and adx support on x86_64).
//
// Every macro is wrapped in do { ... } while (0) so that it expands to exactly
// one statement. A bare if/else expansion breaks (or silently rebinds the
// else) when the macro is used as the un-braced body of another if/else.
// Callers must terminate the macro invocation with a semicolon.
#define p384_felem_mul(out, in0, in1)          \
  do {                                         \
    if (p384_use_s2n_bignum_alt()) {           \
      bignum_montmul_p384_alt(out, in0, in1);  \
    } else {                                   \
      bignum_montmul_p384(out, in0, in1);      \
    }                                          \
  } while (0)

#define p384_felem_sqr(out, in0)          \
  do {                                    \
    if (p384_use_s2n_bignum_alt()) {      \
      bignum_montsqr_p384_alt(out, in0);  \
    } else {                              \
      bignum_montsqr_p384(out, in0);      \
    }                                     \
  } while (0)

#define p384_felem_to_mont(out, in0)     \
  do {                                   \
    if (p384_use_s2n_bignum_alt()) {     \
      bignum_tomont_p384_alt(out, in0);  \
    } else {                             \
      bignum_tomont_p384(out, in0);      \
    }                                    \
  } while (0)

#define p384_felem_from_mont(out, in0)    \
  do {                                    \
    if (p384_use_s2n_bignum_alt()) {      \
      bignum_deamont_p384_alt(out, in0);  \
    } else {                              \
      bignum_deamont_p384(out, in0);      \
    }                                     \
  } while (0)
#define p384_felem_to_mont(out, in0) bignum_tomont_p384_selector(out, in0)
#define p384_felem_from_mont(out, in0) bignum_deamont_p384_selector(out, in0)
#define p384_felem_mul(out, in0, in1) bignum_montmul_p384_selector(out, in0, in1)
#define p384_felem_sqr(out, in0) bignum_montsqr_p384_selector(out, in0)

static p384_limb_t p384_felem_nz(const p384_limb_t in1[P384_NLIMBS]) {
return bignum_nonzero_6(in1);
Expand Down
30 changes: 2 additions & 28 deletions crypto/fipsmodule/ec/p521.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,40 +77,14 @@ static const p521_limb_t p521_felem_p[P521_NLIMBS] = {
0xffffffffffffffff, 0xffffffffffffffff,
0x1ff};

#if defined(OPENSSL_X86_64)
// x86_64: some s2n-bignum functions are written with the bmi2 and adx
// instruction sets, which not every x86 CPU provides. This helper reports
// whether the slightly slower but generic "_alt" versions must be used:
// it returns 1 unless both capability checks succeed, and 0 otherwise.
static inline uint8_t p521_use_s2n_bignum_alt(void) {
  const int has_bmi2_and_adx =
      CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
  return !has_bmi2_and_adx;
}
#else
// On aarch64 platforms s2n-bignum has two implementations of certain
// functions -- the default one and the alternative (suffixed _alt).
// Depending on the architecture one version is faster than the other.
// Generally, the "_alt" functions are faster on architectures with higher
// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips.
//
// This definition is the #else branch of the OPENSSL_X86_64 check. It simply
// forwards the result of CRYPTO_is_ARMv8_wide_multiplier_capable(); the
// p521_felem_* macros treat a nonzero result as "call the _alt function".
static inline uint8_t p521_use_s2n_bignum_alt(void) {
return CRYPTO_is_ARMv8_wide_multiplier_capable();
}
#endif

// s2n-bignum implementation of field arithmetic
#define p521_felem_add(out, in0, in1) bignum_add_p521(out, in0, in1)
#define p521_felem_sub(out, in0, in1) bignum_sub_p521(out, in0, in1)
#define p521_felem_opp(out, in0) bignum_neg_p521(out, in0)
#define p521_felem_to_bytes(out, in0) bignum_tolebytes_p521(out, in0)
#define p521_felem_from_bytes(out, in0) bignum_fromlebytes_p521(out, in0)

// The following two operations exist in two s2n-bignum variants. Each macro
// asks p521_use_s2n_bignum_alt() at run time and calls the "_alt" function
// when it returns nonzero, otherwise the default function (the default
// variants are the ones that need bmi2 and adx support on x86_64).
//
// Both macros are wrapped in do { ... } while (0) so that they expand to
// exactly one statement. A bare if/else expansion breaks (or silently rebinds
// the else) when the macro is used as the un-braced body of another if/else.
// Callers must terminate the macro invocation with a semicolon.
#define p521_felem_mul(out, in0, in1)      \
  do {                                     \
    if (p521_use_s2n_bignum_alt()) {       \
      bignum_mul_p521_alt(out, in0, in1);  \
    } else {                               \
      bignum_mul_p521(out, in0, in1);      \
    }                                      \
  } while (0)

#define p521_felem_sqr(out, in0)      \
  do {                                \
    if (p521_use_s2n_bignum_alt()) {  \
      bignum_sqr_p521_alt(out, in0);  \
    } else {                          \
      bignum_sqr_p521(out, in0);      \
    }                                 \
  } while (0)
#define p521_felem_mul(out, in0, in1) bignum_mul_p521_selector(out, in0, in1)
#define p521_felem_sqr(out, in0) bignum_sqr_p521_selector(out, in0)

#else // P521_USE_S2N_BIGNUM_FIELD_ARITH

Expand Down
Loading
Loading