From e835d0d761945bb242d271f5ccedf0aee54a4ca8 Mon Sep 17 00:00:00 2001
From: Amos Onn
Date: Thu, 30 Jan 2020 20:04:24 +0100
Subject: [PATCH 1/6] Optimize core::ptr::align_offset

- Stopping condition inside mod_inv can be >= instead of >
- Remove intrinsics::unchecked_rem; we are working modulo powers of two,
  so we can simply mask
---
 src/libcore/ptr/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 9727e4face56a..8d83937802708 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1083,7 +1083,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
             // anyway.
             inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)))
                 & (going_mod - 1);
-            if going_mod > m {
+            if going_mod >= m {
                 return inverse & (m - 1);
             }
             going_mod = going_mod.wrapping_mul(going_mod);
@@ -1134,7 +1134,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         // to take the result $o mod lcm(s, a)$. We can replace $lcm(s, a)$ with just a $a / g$.
         let j = a.wrapping_sub(pmoda) >> gcdpow;
         let k = smoda >> gcdpow;
-        return intrinsics::unchecked_rem(j.wrapping_mul(mod_inv(k, a)), a >> gcdpow);
+        return (j.wrapping_mul(mod_inv(k, a))) & ((a >> gcdpow).wrapping_sub(1));
     }
 
     // Cannot be aligned at all.
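
Both halves of this first patch can be checked in isolation. The stopping condition may be `>=` because once `going_mod == m` the inverse is already valid modulo `m`, so one more squaring buys nothing. And `unchecked_rem` is unnecessary because reduction modulo a power of two is a bit mask. A minimal standalone sketch of that masking identity (plain Rust, not the libcore code):

    fn main() {
        for mpow in 0..8 {
            let m: usize = 1 << mpow; // power-of-two modulus, like `a >> gcdpow`
            for x in 0..256usize {
                // The low bits kept by the mask `m - 1` are exactly the remainder.
                assert_eq!(x % m, x & (m - 1));
            }
        }
    }
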
From 3173cd1473eeebcc9567b686e63d281a761fd936 Mon Sep 17 00:00:00 2001
From: Amos Onn
Date: Tue, 28 Jan 2020 22:14:04 +0100
Subject: [PATCH 2/6] Optimize core::ptr::align_offset

- When calculating the inverse, it's enough to work `mod a/g` instead of
  `mod a`.
---
 src/libcore/ptr/mod.rs | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 8d83937802708..805404b101b74 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1115,26 +1115,33 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     let gcdpow = intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a));
     let gcd = 1usize << gcdpow;
 
-    if p as usize & (gcd - 1) == 0 {
+    if p as usize & (gcd.wrapping_sub(1)) == 0 {
         // This branch solves for the following linear congruence equation:
         //
-        // $$ p + so ≡ 0 mod a $$
+        // ` p + so = 0 mod a `
         //
-        // $p$ here is the pointer value, $s$ – stride of `T`, $o$ offset in `T`s, and $a$ – the
+        // `p` here is the pointer value, `s` - stride of `T`, `o` offset in `T`s, and `a` - the
         // requested alignment.
         //
-        // g = gcd(a, s)
-        // o = (a - (p mod a))/g * ((s/g)⁻¹ mod a)
+        // With `g = gcd(a, s)`, and the above asserting that `p` is also divisible by `g`, we can
+        // denote `a' = a/g`, `s' = s/g`, `p' = p/g`, then this becomes equivalent to:
         //
-        // The first term is “the relative alignment of p to a”, the second term is “how does
-        // incrementing p by s bytes change the relative alignment of p”. Division by `g` is
-        // necessary to make this equation well formed if $a$ and $s$ are not co-prime.
+        // ` p' + s'o = 0 mod a' `
+        // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
        //
-        // Furthermore, the result produced by this solution is not “minimal”, so it is necessary
-        // to take the result $o mod lcm(s, a)$. We can replace $lcm(s, a)$ with just a $a / g$.
-        let j = a.wrapping_sub(pmoda) >> gcdpow;
-        let k = smoda >> gcdpow;
-        return (j.wrapping_mul(mod_inv(k, a))) & ((a >> gcdpow).wrapping_sub(1));
+        // The first term is "the relative alignment of `p` to `a`" (divided by `g`), the
+        // second term is "how does incrementing `p` by `s` bytes change the relative alignment
+        // of `p`" (again divided by `g`).
+        // Division by `g` is necessary to make the inverse well formed if `a` and `s` are not
+        // co-prime.
+        //
+        // Furthermore, the result produced by this solution is not "minimal", so it is necessary
+        // to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just `a'`.
+        let a2 = a >> gcdpow;
+        let a2minus1 = a2.wrapping_sub(1);
+        let s2 = smoda >> gcdpow;
+        let minusp2 = a2.wrapping_sub(pmoda >> gcdpow);
+        return (minusp2.wrapping_mul(mod_inv(s2, a2))) & a2minus1;
     }
 
     // Cannot be aligned at all.

From 22b263ae1837ab6a64fe4bcdbfa07aa8883f57db Mon Sep 17 00:00:00 2001
From: Amos Onn
Date: Sun, 2 Feb 2020 02:18:33 +0100
Subject: [PATCH 3/6] Optimize core::ptr::align_offset

- As explained in the comment inside mod_inv, it is valid to work mod
  `usize::max_value()` right until the end.
---
 src/libcore/ptr/mod.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 805404b101b74..0ee50966f968c 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1081,8 +1081,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
             // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
             // usize::max_value()` instead, because we take the result `mod n` at the end
             // anyway.
-            inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)))
-                & (going_mod - 1);
+            inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
             if going_mod >= m {
                 return inverse & (m - 1);
             }
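
Patches 2 and 3 lean on the same Newton-Hensel iteration: a solution of `xy = 1 (mod 2^n)` lifts to one valid `mod 2^(2n)`, every intermediate operation may wrap modulo `usize::max_value()`, and (patch 2's point) the inverse only ever needs to be computed in the reduced modulus `a' = a/g`. The following self-contained model (illustrative, not the libcore source) verifies that a single final reduction is enough:

    /// Inverse of odd `x` modulo a power of two `m`, with one mask at the end.
    fn mod_inv(x: usize, m: usize) -> usize {
        assert!(m.is_power_of_two() && x % 2 == 1);
        let mut inverse = 1usize; // any odd x satisfies x * 1 = 1 (mod 2)
        let mut going_mod = 4usize; // modulus the next update will be valid for
        loop {
            // Lift: x*y = 1 (mod 2^n)  =>  x * (y * (2 - x*y)) = 1 (mod 2^(2n)).
            // All arithmetic wraps mod 2^64; that is patch 3's observation.
            inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
            if going_mod >= m {
                return inverse & (m - 1); // the only reduction modulo m
            }
            going_mod = going_mod.wrapping_mul(going_mod);
        }
    }

    fn main() {
        for mpow in 1..=16 {
            let m = 1usize << mpow;
            for x in (1..m).step_by(2) {
                assert_eq!(x.wrapping_mul(mod_inv(x, m)) & (m - 1), 1);
            }
        }
    }
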
From f974aff4e57d432e049f7fe9e0ea218ead78b010 Mon Sep 17 00:00:00 2001
From: Amos Onn
Date: Tue, 28 Jan 2020 22:53:28 +0100
Subject: [PATCH 4/6] Optimize core::ptr::align_offset

- Instead of squaring the modulo until it is larger than the required
  one, we double its log2. This means a shift instead of a mul on each
  iteration.
---
 src/libcore/ptr/mod.rs | 51 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 0ee50966f968c..73fab6d09952e 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1043,49 +1043,55 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
 /// Any questions go to @nagisa.
 #[lang = "align_offset"]
 pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
-    /// Calculate multiplicative modular inverse of `x` modulo `m`.
+    /// Calculate multiplicative modular inverse of `x` modulo `m = 2^mpow`.
     ///
     /// This implementation is tailored for align_offset and has following preconditions:
     ///
-    /// * `m` is a power-of-two;
-    /// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
+    /// * The requested modulo `m` is a power-of-two, so `mpow` can be an argument;
+    /// * `x < m`; (if `x >= m`, pass in `x % m` instead)
     ///
     /// Implementation of this function shall not panic. Ever.
     #[inline]
-    fn mod_inv(x: usize, m: usize) -> usize {
-        /// Multiplicative modular inverse table modulo 2⁴ = 16.
+    fn mod_pow_2_inv(x: usize, mpow: usize) -> usize {
+        /// Multiplicative modular inverse table modulo 2^4 = 16.
         ///
         /// Note, that this table does not contain values where inverse does not exist (i.e., for
-        /// `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
+        /// `0^-1 mod 16`, `2^-1 mod 16`, etc.)
         const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
-        /// Modulo for which the `INV_TABLE_MOD_16` is intended.
-        const INV_TABLE_MOD: usize = 16;
-        /// INV_TABLE_MOD²
-        const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;
+        /// `t` such that `2^t` is the modulo for which the `INV_TABLE_MOD_16` is intended.
+        const INV_TABLE_MOD_POW: usize = 4;
+        const INV_TABLE_MOD_POW_TIMES_2: usize = INV_TABLE_MOD_POW << 1;
+        const INV_TABLE_MOD: usize = 1 << INV_TABLE_MOD_POW;
 
         let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
-        if m <= INV_TABLE_MOD {
-            table_inverse & (m - 1)
+        let mask = (1usize << mpow) - 1;
+
+        if mpow <= INV_TABLE_MOD_POW {
+            table_inverse & mask
         } else {
             // We iterate "up" using the following formula:
             //
-            // $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
+            // ` xy = 1 (mod 2^n) -> xy (2 - xy) = 1 (mod 2^(2n)) `
+            //
+            // until `2^(2n) >= m`. Then we can reduce to our desired `m` by taking the result `mod m`.
            //
-            // until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
+            // Running `k` iterations starting with a solution valid mod `2^t` will get us a
+            // solution valid mod `2^((2^k) * t)`, so we need to calculate for which `k`,
+            // `(2^k) * t >= log2(m)`.
             let mut inverse = table_inverse;
-            let mut going_mod = INV_TABLE_MOD_SQUARED;
+            let mut going_modpow = INV_TABLE_MOD_POW_TIMES_2;
             loop {
-                // y = y * (2 - xy) mod n
+                // y = y * (2 - xy)
                 //
-                // Note, that we use wrapping operations here intentionally – the original formula
+                // Note that we use wrapping operations here intentionally - the original formula
                 // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
                 // usize::max_value()` instead, because we take the result `mod n` at the end
                 // anyway.
                 inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
-                if going_mod >= m {
-                    return inverse & (m - 1);
+                if going_modpow >= mpow {
+                    return inverse & mask;
                 }
-                going_mod = going_mod.wrapping_mul(going_mod);
+                going_modpow <<= 1;
             }
         }
     }
@@ -1111,7 +1117,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     let smoda = stride & a_minus_one;
 
     // a is power-of-two so cannot be 0. stride = 0 is handled above.
-    let gcdpow = intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a));
+    let apow = intrinsics::cttz_nonzero(a);
+    let gcdpow = intrinsics::cttz_nonzero(stride).min(apow);
     let gcd = 1usize << gcdpow;
 
     if p as usize & (gcd.wrapping_sub(1)) == 0 {
@@ -1140,7 +1147,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         let a2minus1 = a2.wrapping_sub(1);
         let s2 = smoda >> gcdpow;
         let minusp2 = a2.wrapping_sub(pmoda >> gcdpow);
-        return (minusp2.wrapping_mul(mod_inv(s2, a2))) & a2minus1;
+        return (minusp2.wrapping_mul(mod_pow_2_inv(s2, apow.wrapping_sub(gcdpow)))) & a2minus1;
     }
 
     // Cannot be aligned at all.
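
The change in patch 4 is pure bookkeeping: rather than squaring `going_mod` up toward `m`, the code tracks exponents and doubles `going_modpow`, turning a multiply per iteration into a shift. A standalone sketch of the patched shape (names mirror the patch, but this is not the libcore source):

    fn mod_pow_2_inv(x: usize, mpow: usize) -> usize {
        const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
        const INV_TABLE_MOD_POW: usize = 4;

        let table_inverse = INV_TABLE_MOD_16[(x & 15) >> 1] as usize;
        let mask = (1usize << mpow) - 1;
        if mpow <= INV_TABLE_MOD_POW {
            return table_inverse & mask;
        }
        let mut inverse = table_inverse;
        let mut going_modpow = INV_TABLE_MOD_POW << 1; // log2 of the valid modulus
        loop {
            inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
            if going_modpow >= mpow {
                return inverse & mask;
            }
            going_modpow <<= 1; // was: going_mod = going_mod.wrapping_mul(going_mod)
        }
    }

    fn main() {
        for mpow in 1..=20 {
            let m = 1usize << mpow;
            for x in (1..m.min(1 << 12)).step_by(2) {
                assert_eq!(x.wrapping_mul(mod_pow_2_inv(x, mpow)) & (m - 1), 1);
            }
        }
    }
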
From fe1ac2556de0da83f632b7b77fa2cda95b686e85 Mon Sep 17 00:00:00 2001
From: Amos Onn
Date: Wed, 29 Jan 2020 16:59:47 +0100
Subject: [PATCH 5/6] Optimize core::ptr::align_offset

- Pass mask as arg to mod_pow_2_inv instead of calculating it again.
---
 src/libcore/ptr/mod.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 73fab6d09952e..643e66834d4d8 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1043,7 +1043,8 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
 /// Any questions go to @nagisa.
 #[lang = "align_offset"]
 pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
-    /// Calculate multiplicative modular inverse of `x` modulo `m = 2^mpow`.
+    /// Calculate multiplicative modular inverse of `x` modulo `m`, where
+    /// `m = 2^mpow` and `mask = m - 1`.
     ///
     /// This implementation is tailored for align_offset and has following preconditions:
     ///
@@ -1052,7 +1053,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     ///
     /// Implementation of this function shall not panic. Ever.
     #[inline]
-    fn mod_pow_2_inv(x: usize, mpow: usize) -> usize {
+    fn mod_pow_2_inv(x: usize, mpow: usize, mask: usize) -> usize {
         /// Multiplicative modular inverse table modulo 2^4 = 16.
         ///
         /// Note, that this table does not contain values where inverse does not exist (i.e., for
@@ -1064,7 +1065,6 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         const INV_TABLE_MOD: usize = 1 << INV_TABLE_MOD_POW;
 
         let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
-        let mask = (1usize << mpow) - 1;
 
         if mpow <= INV_TABLE_MOD_POW {
             table_inverse & mask
@@ -1147,7 +1147,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         let a2minus1 = a2.wrapping_sub(1);
         let s2 = smoda >> gcdpow;
         let minusp2 = a2.wrapping_sub(pmoda >> gcdpow);
-        return (minusp2.wrapping_mul(mod_pow_2_inv(s2, apow.wrapping_sub(gcdpow)))) & a2minus1;
+        return (minusp2.wrapping_mul(mod_pow_2_inv(s2, apow.wrapping_sub(gcdpow), a2minus1)))
+            & a2minus1;
     }
 
     // Cannot be aligned at all.
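
Passing the mask in means the caller, which already needs `a' - 1` for its own final reduction, shares it with `mod_pow_2_inv` instead of having the callee rebuild `(1 << mpow) - 1`. A toy end-to-end check of the patched call shape (the helper name below is hypothetical, not from libcore):

    fn mod_pow_2_inv(x: usize, mpow: usize, mask: usize) -> usize {
        const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
        let mut inverse = INV_TABLE_MOD_16[(x & 15) >> 1] as usize;
        if mpow <= 4 {
            return inverse & mask;
        }
        let mut going_modpow = 8usize;
        loop {
            inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
            if going_modpow >= mpow {
                return inverse & mask;
            }
            going_modpow <<= 1;
        }
    }

    // Mirrors the patched call site: one mask value serves both reductions.
    fn offset_in_reduced_modulus(minusp2: usize, s2: usize, mpow: usize) -> usize {
        let a2minus1 = (1usize << mpow).wrapping_sub(1);
        minusp2.wrapping_mul(mod_pow_2_inv(s2, mpow, a2minus1)) & a2minus1
    }

    fn main() {
        // stride s = 6, align a = 8, p = 2 mod a: g = 2, a' = 4, s' = 3,
        // a' - p' = 3, so o = 3 * 3^-1 mod 4 = 1; indeed 2 + 6*1 = 8 is aligned.
        assert_eq!(offset_in_reduced_modulus(3, 3, 2), 1);
    }
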
From 006e467b901a7dc31de060b92302a30fde434355 Mon Sep 17 00:00:00 2001
From: Amos Onn
Date: Sun, 2 Feb 2020 02:25:43 +0100
Subject: [PATCH 6/6] Optimize core::ptr::align_offset

- Remove redundant masking from mod_pow_2_inv; the call site already
  takes care of this (after the mul). According to benchmarking, the
  best performance is achieved when this masking is present in the fast
  branch (for small moduli), but absent from the slow branch.
---
 src/libcore/ptr/mod.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/libcore/ptr/mod.rs b/src/libcore/ptr/mod.rs
index 643e66834d4d8..fd8dcfc631233 100644
--- a/src/libcore/ptr/mod.rs
+++ b/src/libcore/ptr/mod.rs
@@ -1051,6 +1051,9 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     /// * The requested modulo `m` is a power-of-two, so `mpow` can be an argument;
     /// * `x < m`; (if `x >= m`, pass in `x % m` instead)
     ///
+    /// It also sometimes leaves reducing the result modulo `m` to the caller, so the result may be
+    /// larger than `m`.
+    ///
     /// Implementation of this function shall not panic. Ever.
     #[inline]
     fn mod_pow_2_inv(x: usize, mpow: usize, mask: usize) -> usize {
@@ -1067,6 +1070,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
 
         if mpow <= INV_TABLE_MOD_POW {
+            // This is explicitly left here, as benchmarking shows this improves performance.
             table_inverse & mask
         } else {
             // We iterate "up" using the following formula:
@@ -1089,7 +1093,7 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
             // anyway.
             inverse = inverse.wrapping_mul(2usize.wrapping_sub(x.wrapping_mul(inverse)));
             if going_modpow >= mpow {
-                return inverse & mask;
+                return inverse;
             }
             going_modpow <<= 1;
         }
@@ -1147,6 +1151,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         let a2minus1 = a2.wrapping_sub(1);
         let s2 = smoda >> gcdpow;
         let minusp2 = a2.wrapping_sub(pmoda >> gcdpow);
+        // mod_pow_2_inv returns a result which may be outside the range of `a'`, but it's fine
+        // to multiply modulo usize::max_value() here, and then take modulo `a'` afterwards.
         return (minusp2.wrapping_mul(mod_pow_2_inv(s2, apow.wrapping_sub(gcdpow), a2minus1)))
             & a2minus1;
     }
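
The mask removed in the slow branch is safe to drop because reduction modulo `2^k` commutes with multiplication modulo `2^64`: multiplying an unreduced inverse and masking once afterwards yields the same bits. A short standalone check of that invariant (illustrative values, not the libcore code):

    fn main() {
        let mask: usize = (1 << 6) - 1; // an arbitrary power-of-two modulus
        for a in (0..1usize << 12).step_by(17) {
            for b in (0..1usize << 12).step_by(13) {
                let eager = (a & mask).wrapping_mul(b & mask) & mask;
                let deferred = a.wrapping_mul(b) & mask;
                assert_eq!(eager, deferred);
            }
        }
    }
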