diff --git a/build.rs b/build.rs
index da1379f7b..eae7bcaf2 100644
--- a/build.rs
+++ b/build.rs
@@ -40,12 +40,12 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[], "crypto/mem.c"),
     (&[], "crypto/poly1305/poly1305.c"),
 
-    (&[AARCH64, ARM, X86_64, X86], "crypto/crypto.c"),
-    (&[AARCH64, ARM, X86_64, X86], "crypto/curve25519/curve25519.c"),
-    (&[AARCH64, ARM, X86_64, X86], "crypto/fipsmodule/ec/ecp_nistz.c"),
-    (&[AARCH64, ARM, X86_64, X86], "crypto/fipsmodule/ec/gfp_p256.c"),
-    (&[AARCH64, ARM, X86_64, X86], "crypto/fipsmodule/ec/gfp_p384.c"),
-    (&[AARCH64, ARM, X86_64, X86], "crypto/fipsmodule/ec/p256.c"),
+    (&[], "crypto/crypto.c"),
+    (&[], "crypto/curve25519/curve25519.c"),
+    (&[], "crypto/fipsmodule/ec/ecp_nistz.c"),
+    (&[], "crypto/fipsmodule/ec/gfp_p256.c"),
+    (&[], "crypto/fipsmodule/ec/gfp_p384.c"),
+    (&[], "crypto/fipsmodule/ec/p256.c"),
 
     (&[X86_64, X86], "crypto/cpu-intel.c"),
 
diff --git a/crypto/fipsmodule/aes/aes_nohw.c b/crypto/fipsmodule/aes/aes_nohw.c
index a6306f049..d871d82c7 100644
--- a/crypto/fipsmodule/aes/aes_nohw.c
+++ b/crypto/fipsmodule/aes/aes_nohw.c
@@ -346,19 +346,18 @@ static inline uint8_t lo(uint32_t a) {
 
 static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                           const uint8_t in[16]) {
-  OPENSSL_memcpy(out, in, 16);
 #if defined(OPENSSL_SSE2)
-  // No conversions needed.
+  OPENSSL_memcpy(out, in, 16);  // No conversions needed.
 #elif defined(OPENSSL_64_BIT)
-  uint64_t a0 = aes_nohw_compact_word(out[0]);
-  uint64_t a1 = aes_nohw_compact_word(out[1]);
+  uint64_t a0 = aes_nohw_compact_word(CRYPTO_read_le64(in));
+  uint64_t a1 = aes_nohw_compact_word(CRYPTO_read_le64(in + 8));
   out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
   out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
 #else
-  uint32_t a0 = aes_nohw_compact_word(out[0]);
-  uint32_t a1 = aes_nohw_compact_word(out[1]);
-  uint32_t a2 = aes_nohw_compact_word(out[2]);
-  uint32_t a3 = aes_nohw_compact_word(out[3]);
+  uint32_t a0 = aes_nohw_compact_word(CRYPTO_read_le32(in));
+  uint32_t a1 = aes_nohw_compact_word(CRYPTO_read_le32(in + 4));
+  uint32_t a2 = aes_nohw_compact_word(CRYPTO_read_le32(in + 8));
+  uint32_t a3 = aes_nohw_compact_word(CRYPTO_read_le32(in + 12));
   // Note clang, when building for ARM Thumb2, will sometimes miscompile
   // expressions such as (a0 & 0x0000ff00) << 8, particularly when building
   // without optimizations. This bug was introduced in
@@ -382,8 +381,8 @@ static inline void aes_nohw_uncompact_block(
       aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
   uint64_t b1 =
       aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
-  OPENSSL_memcpy(out, &b0, 8);
-  OPENSSL_memcpy(out + 8, &b1, 8);
+  CRYPTO_write_le64(b0, out);
+  CRYPTO_write_le64(b1, out + 8);
 #else
   uint32_t a0 = in[0];
   uint32_t a1 = in[1];
@@ -404,10 +403,10 @@ static inline void aes_nohw_uncompact_block(
   b1 = aes_nohw_uncompact_word(b1);
   b2 = aes_nohw_uncompact_word(b2);
   b3 = aes_nohw_uncompact_word(b3);
-  OPENSSL_memcpy(out, &b0, 4);
-  OPENSSL_memcpy(out + 4, &b1, 4);
-  OPENSSL_memcpy(out + 8, &b2, 4);
-  OPENSSL_memcpy(out + 12, &b3, 4);
+  CRYPTO_write_le32(b0, out);
+  CRYPTO_write_le32(b1, out + 4);
+  CRYPTO_write_le32(b2, out + 8);
+  CRYPTO_write_le32(b3, out + 12);
 #endif
 }
 
@@ -925,18 +924,17 @@ void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
 
   // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
   alignas(AES_NOHW_WORD_SIZE) union {
-    uint32_t u32[AES_NOHW_BATCH_SIZE * 4];
     uint8_t u8[AES_NOHW_BATCH_SIZE * 16];
   } ivs, enc_ivs;
   for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
     OPENSSL_memcpy(ivs.u8 + 16 * i, ivec, 16);
   }
 
-  uint32_t ctr = CRYPTO_bswap4(ivs.u32[3]);
+  uint32_t ctr = CRYPTO_read_be32(ivs.u8 + 12);
   for (;;) {
     // Update counters.
     for (uint32_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
-      ivs.u32[4 * i + 3] = CRYPTO_bswap4(ctr + i);
+      CRYPTO_write_be32(ctr + i, ivs.u8 + 16 * i + 12);
     }
 
     size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
diff --git a/crypto/fipsmodule/bn/montgomery.c b/crypto/fipsmodule/bn/montgomery.c
index b1f1c6932..e047bf5a6 100644
--- a/crypto/fipsmodule/bn/montgomery.c
+++ b/crypto/fipsmodule/bn/montgomery.c
@@ -156,3 +156,17 @@ int bn_from_montgomery_in_place(BN_ULONG r[], size_t num_r, BN_ULONG a[],
   }
   return 1;
 }
+
+#if !defined(OPENSSL_X86) && !defined(OPENSSL_X86_64) && \
+    !defined(OPENSSL_ARM) && !defined(OPENSSL_AARCH64)
+// Only built when none of the four architecture macros above is defined.
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                 const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+  Limb acc[2 * num];  // Low half is zeroed; the high half is filled below.
+  for (size_t k = 0; k < num; k++) { acc[k] = 0; }
+  for (size_t row = 0; row < num; row++) {
+    acc[num + row] = limbs_mul_add_limb(acc + row, ap, bp[row], num);
+  }
+  bn_from_montgomery_in_place(rp, num, acc, 2 * num, np, num, n0);
+}
+#endif
diff --git a/crypto/fipsmodule/ec/p256_shared.h b/crypto/fipsmodule/ec/p256_shared.h
index 4dd325bee..3c1f107a6 100644
--- a/crypto/fipsmodule/ec/p256_shared.h
+++ b/crypto/fipsmodule/ec/p256_shared.h
@@ -50,7 +50,14 @@ typedef unsigned char P256_SCALAR_BYTES[33];
 
 static inline void p256_scalar_bytes_from_limbs(
     P256_SCALAR_BYTES bytes_out, const BN_ULONG limbs[P256_LIMBS]) {
-  OPENSSL_memcpy(bytes_out, limbs, 32);
+  // Emit each limb as little-endian bytes, in limb order.
+  for (size_t i = 0; i < P256_LIMBS; i++) {
+#if BN_BITS2 == 64
+    CRYPTO_write_le64(limbs[i], bytes_out + i * 8);
+#else
+    CRYPTO_write_le32(limbs[i], bytes_out + i * 4);
+#endif
+  }
   bytes_out[32] = 0;
 }
 
diff --git a/crypto/internal.h b/crypto/internal.h
index b975c0b53..88e19e5b9 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -254,19 +254,77 @@ static inline crypto_word constant_time_select_w(crypto_word mask,
 
 // Endianness conversions.
 
-#if defined(__GNUC__) && __GNUC__ >= 2
-static inline uint32_t CRYPTO_bswap4(uint32_t x) {
-  return __builtin_bswap32(x);
+// Reads 4 bytes at |p| as a little-endian uint32_t (p[0] is the low byte).
+static inline uint32_t CRYPTO_read_le32(const uint8_t *p) {
+  const uint32_t b0 = p[0], b1 = p[1], b2 = p[2], b3 = p[3];
+  // Widened to uint32_t first so the shifts cannot overflow a promoted int.
+  return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
 }
-#elif defined(_MSC_VER)
-#pragma warning(push, 3)
-#include <stdlib.h>
-#pragma warning(pop)
-#pragma intrinsic(_byteswap_uint64, _byteswap_ulong)
-static inline uint32_t CRYPTO_bswap4(uint32_t x) {
-  return _byteswap_ulong(x);
+
+// Reads 4 bytes at |p| as a big-endian uint32_t (p[0] is the high byte).
+static inline uint32_t CRYPTO_read_be32(const uint8_t *p) {
+  const uint32_t b0 = p[0], b1 = p[1], b2 = p[2], b3 = p[3];
+  // Widened to uint32_t first so the shifts cannot overflow a promoted int.
+  return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+}
+
+// Reads 8 bytes at |p| as a little-endian uint64_t (p[0] is the low byte),
+// assembled from two 32-bit little-endian halves.
+static inline uint64_t CRYPTO_read_le64(const uint8_t *p) {
+  // Bytes 0..3 form the low word, bytes 4..7 the high word.
+  const uint64_t lo = CRYPTO_read_le32(p);
+  const uint64_t hi = CRYPTO_read_le32(p + 4);
+
+  // |hi| is already 64 bits wide, so the shift by 32 is well defined.
+  return lo | (hi << 32);
+}
+
+// Reads 8 bytes at |p| as a big-endian uint64_t (p[0] is the high byte),
+// assembled from two 32-bit big-endian halves.
+static inline uint64_t CRYPTO_read_be64(const uint8_t *p) {
+  // Bytes 0..3 form the high word, bytes 4..7 the low word.
+  const uint64_t hi = CRYPTO_read_be32(p);
+  const uint64_t lo = CRYPTO_read_be32(p + 4);
+
+  // |hi| is already 64 bits wide, so the shift by 32 is well defined.
+  return (hi << 32) | lo;
+}
+
+static inline void CRYPTO_write_le32(uint32_t v, uint8_t *p) {
+  // Least significant byte first; the uint8_t casts keep the low 8 bits.
+  for (unsigned i = 0; i < 4; i++) {
+    p[i] = (uint8_t)(v >> (8 * i));
+  }
+}
+
+static inline void CRYPTO_write_be32(uint32_t v, uint8_t *p) {
+  // Most significant byte first; the uint8_t casts keep the low 8 bits.
+  for (unsigned i = 0; i < 4; i++) {
+    p[i] = (uint8_t)(v >> (24 - 8 * i));
+  }
+}
+
+// Stores |v| at |p| as 8 little-endian bytes (least significant first),
+// written as two 32-bit little-endian halves.
+static inline void CRYPTO_write_le64(uint64_t v, uint8_t *p) {
+  // The uint32_t casts deliberately truncate to the selected half.
+  const uint32_t lo = (uint32_t)v;
+  const uint32_t hi = (uint32_t)(v >> 32);
+
+  CRYPTO_write_le32(lo, p);
+  CRYPTO_write_le32(hi, p + 4);
+}
+
+// Stores |v| at |p| as 8 big-endian bytes (most significant first),
+// written as two 32-bit big-endian halves.
+static inline void CRYPTO_write_be64(uint64_t v, uint8_t *p) {
+  // The uint32_t casts deliberately truncate to the selected half.
+  const uint32_t hi = (uint32_t)(v >> 32);
+  const uint32_t lo = (uint32_t)v;
+
+  CRYPTO_write_be32(hi, p);
+  CRYPTO_write_be32(lo, p + 4);
 }
-#endif
 
 #if !defined(RING_CORE_NOSTDLIBINC)
 #include <string.h>
diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c
index a2f0b987f..cd1c7d3d5 100644
--- a/crypto/poly1305/poly1305.c
+++ b/crypto/poly1305/poly1305.c
@@ -29,15 +29,12 @@
 #pragma GCC diagnostic ignored "-Wconversion"
 #endif
 
-// We can assume little-endian.
 static uint32_t U8TO32_LE(const uint8_t *m) {
-  uint32_t r;
-  OPENSSL_memcpy(&r, m, sizeof(r));
-  return r;
+  return CRYPTO_read_le32(m);  // Byte-wise load: host byte order is irrelevant.
 }
 
 static void U32TO8_LE(uint8_t *m, uint32_t v) {
-  OPENSSL_memcpy(m, &v, sizeof(v));
+  CRYPTO_write_le32(v, m);  // Byte-wise store: host byte order is irrelevant.
 }
 
 static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; }
diff --git a/include/ring-core/base.h b/include/ring-core/base.h
index f1a027d1a..69cf40ca2 100644
--- a/include/ring-core/base.h
+++ b/include/ring-core/base.h
@@ -89,6 +89,9 @@
 #elif defined(__MIPSEL__) && defined(__LP64__)
 #define OPENSSL_64_BIT
 #define OPENSSL_MIPS64
+#elif defined(__s390x__)
+#define OPENSSL_64_BIT
+#define OPENSSL_S390X
 #elif defined(__wasm__)
 #define OPENSSL_32_BIT
 #else