From e835d0d761945bb242d271f5ccedf0aee54a4ca8 Mon Sep 17 00:00:00 2001
From: Amos Onn <amosonn@gmail.com>
Date: Thu, 30 Jan 2020 20:04:24 +0100
Subject: [PATCH 1/3] Optimize core::ptr::align_offset

- Stopping condition inside mod_inv can be >= instead of >
- Remove intrinsics::unchecked_rem; we are working modulo powers of 2, so
  we can simply mask instead
---
 src/libcore/ptr/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 9727e4face56a..8d83937802708 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1083,7 +1083,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
                 // anyway.
                 inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)))
                     & (going_mod - 1);
-                if going_mod > m {
+                if going_mod >= m {
                     return inverse & (m - 1);
                 }
                 going_mod = going_mod.wrapping_mul(going_mod);
@@ -1134,7 +1134,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         // to take the result $o mod lcm(s, a)$. We can replace $lcm(s, a)$ with just a $a / g$.
         let j = a.wrapping_sub(pmoda) >> gcdpow;
         let k = smoda >> gcdpow;
-        return intrinsics::unchecked_rem(j.wrapping_mul(mod_inv(k, a)), a >> gcdpow);
+        return (j.wrapping_mul(mod_inv(k, a))) & ((a >> gcdpow).wrapping_sub(1));
     }
 
     // Cannot be aligned at all.

From 3173cd1473eeebcc9567b686e63d281a761fd936 Mon Sep 17 00:00:00 2001
From: Amos Onn <amosonn@gmail.com>
Date: Tue, 28 Jan 2020 22:14:04 +0100
Subject: [PATCH 2/3] Optimize core::ptr::align_offset

- When calculating the inverse, it's enough to work `mod a/g` instead
  of `mod a`.
---
 src/libcore/ptr/mod.rs | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 8d83937802708..805404b101b74 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1115,26 +1115,33 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     let gcdpow = intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a));
     let gcd = 1usize << gcdpow;
 
-    if p as usize & (gcd - 1) == 0 {
+    if p as usize & (gcd.wrapping_sub(1)) == 0 {
         // This branch solves for the following linear congruence equation:
         //
-        // $$ p + so ≡ 0 mod a $$
+        // ` p + so = 0 mod a `
         //
-        // $p$ here is the pointer value, $s$ – stride of `T`, $o$ offset in `T`s, and $a$ – the
+        // `p` here is the pointer value, `s` - stride of `T`, `o` offset in `T`s, and `a` - the
         // requested alignment.
         //
-        // g = gcd(a, s)
-        // o = (a - (p mod a))/g * ((s/g)⁻¹ mod a)
+        // With `g = gcd(a, s)`, and the above asserting that `p` is also divisible by `g`, we can
+        // denote `a' = a/g`, `s' = s/g`, `p' = p/g`, then this becomes equivalent to:
         //
-        // The first term is “the relative alignment of p to a”, the second term is “how does
-        // incrementing p by s bytes change the relative alignment of p”. Division by `g` is
-        // necessary to make this equation well formed if $a$ and $s$ are not co-prime.
+        // ` p' + s'o = 0 mod a' `
+        // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
         //
-        // Furthermore, the result produced by this solution is not “minimal”, so it is necessary
-        // to take the result $o mod lcm(s, a)$. We can replace $lcm(s, a)$ with just a $a / g$.
-        let j = a.wrapping_sub(pmoda) >> gcdpow;
-        let k = smoda >> gcdpow;
-        return (j.wrapping_mul(mod_inv(k, a))) & ((a >> gcdpow).wrapping_sub(1));
+        // The first term is "the relative alignment of `p` to `a`" (divided by `g`), the second
+        // term is "how does incrementing `p` by `s` bytes change the relative alignment of `p`" (again
+        // divided by `g`).
+        // Division by `g` is necessary to make the inverse well formed if `a` and `s` are not
+        // co-prime.
+        //
+        // Furthermore, the result produced by this solution is not "minimal", so it is necessary
+        // to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just `a'`.
+        let a2 = a >> gcdpow;
+        let a2minus1 = a2.wrapping_sub(1);
+        let s2 = smoda >> gcdpow;
+        let minusp2 = a2.wrapping_sub(pmoda >> gcdpow);
+        return (minusp2.wrapping_mul(mod_inv(s2, a2))) & a2minus1;
     }
 
     // Cannot be aligned at all.

From 22b263ae1837ab6a64fe4bcdbfa07aa8883f57db Mon Sep 17 00:00:00 2001
From: Amos Onn <amosonn@gmail.com>
Date: Sun, 2 Feb 2020 02:18:33 +0100
Subject: [PATCH 3/3] Optimize core::ptr::align_offset

- As explained in the comment inside mod_inv, it is valid to work with
  wrapping arithmetic (i.e. mod `usize::max_value() + 1`) right until the
  end, and only reduce `mod m` when returning.
---
 src/libcore/ptr/mod.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 805404b101b74..0ee50966f968c 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1081,8 +1081,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
                 // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
                 // usize::max_value()` instead, because we take the result `mod n` at the end
                 // anyway.
-                inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)))
-                    & (going_mod - 1);
+                inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
                 if going_mod >= m {
                     return inverse & (m - 1);
                 }