Skip to content

Commit 81a75dd

Browse files
authored
Rollup merge of rust-lang#103378 - nagisa:fix-infinite-offset, r=scottmcm
Fix mod_inv termination for the last iteration On usize=u64 platforms, the 4th iteration would overflow the `mod_gate` back to 0. Similarly for usize=u32 platforms, the 3rd iteration would overflow much the same way. I tested various approaches to resolving this, including approaches with `saturating_mul` and `widening_mul` to a double usize. Turns out LLVM likes `mul_with_overflow` the best. In fact now, that LLVM can see the iteration count is limited, it will happily unroll the loop into a nice linear sequence. You will also notice that the code around the loop got simplified somewhat. Now that LLVM is handling the loop nicely, there isn’t any more reasons to manually unroll the first iteration out of the loop (though looking at the code today I’m not sure all that complexity was necessary in the first place). Fixes rust-lang#103361
2 parents 5dc1f96 + a3c3f72 commit 81a75dd

File tree

2 files changed

+40
-26
lines changed

2 files changed

+40
-26
lines changed

library/core/src/ptr/mod.rs

+28-26
Original file line numberDiff line numberDiff line change
@@ -1591,8 +1591,8 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
15911591
// FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
15921592
// 1, where the method versions of these operations are not inlined.
15931593
use intrinsics::{
1594-
cttz_nonzero, exact_div, unchecked_rem, unchecked_shl, unchecked_shr, unchecked_sub,
1595-
wrapping_add, wrapping_mul, wrapping_sub,
1594+
cttz_nonzero, exact_div, mul_with_overflow, unchecked_rem, unchecked_shl, unchecked_shr,
1595+
unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
15961596
};
15971597

15981598
/// Calculate multiplicative modular inverse of `x` modulo `m`.
@@ -1612,36 +1612,38 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
16121612
const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
16131613
/// Modulo for which the `INV_TABLE_MOD_16` is intended.
16141614
const INV_TABLE_MOD: usize = 16;
1615-
/// INV_TABLE_MOD²
1616-
const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;
16171615

1618-
let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
16191616
// SAFETY: `m` is required to be a power-of-two, hence non-zero.
16201617
let m_minus_one = unsafe { unchecked_sub(m, 1) };
1621-
if m <= INV_TABLE_MOD {
1622-
table_inverse & m_minus_one
1623-
} else {
1624-
// We iterate "up" using the following formula:
1625-
//
1626-
// $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
1618+
let mut inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
1619+
let mut mod_gate = INV_TABLE_MOD;
1620+
// We iterate "up" using the following formula:
1621+
//
1622+
// $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
1623+
//
1624+
// This application needs to be applied at least until `2²ⁿ ≥ m`, at which point we can
1625+
// finally reduce the computation to our desired `m` by taking `inverse mod m`.
1626+
//
1627+
// This computation is `O(log log m)`, which is to say, that on 64-bit machines this loop
1628+
// will always finish in at most 4 iterations.
1629+
loop {
1630+
// y = y * (2 - xy) mod n
16271631
//
1628-
// until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
1629-
let mut inverse = table_inverse;
1630-
let mut going_mod = INV_TABLE_MOD_SQUARED;
1631-
loop {
1632-
// y = y * (2 - xy) mod n
1633-
//
1634-
// Note, that we use wrapping operations here intentionally – the original formula
1635-
// uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
1636-
// usize::MAX` instead, because we take the result `mod n` at the end
1637-
// anyway.
1638-
inverse = wrapping_mul(inverse, wrapping_sub(2usize, wrapping_mul(x, inverse)));
1639-
if going_mod >= m {
1640-
return inverse & m_minus_one;
1641-
}
1642-
going_mod = wrapping_mul(going_mod, going_mod);
1632+
// Note, that we use wrapping operations here intentionally – the original formula
1633+
// uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
1634+
// usize::MAX` instead, because we take the result `mod n` at the end
1635+
// anyway.
1636+
if mod_gate >= m {
1637+
break;
1638+
}
1639+
inverse = wrapping_mul(inverse, wrapping_sub(2usize, wrapping_mul(x, inverse)));
1640+
let (new_gate, overflow) = mul_with_overflow(mod_gate, mod_gate);
1641+
if overflow {
1642+
break;
16431643
}
1644+
mod_gate = new_gate;
16441645
}
1646+
inverse & m_minus_one
16451647
}
16461648

16471649
let addr = p.addr();

library/core/tests/ptr.rs

+12
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,18 @@ fn align_offset_various_strides() {
455455
assert!(!x);
456456
}
457457

458+
#[test]
459+
fn align_offset_issue_103361() {
460+
#[cfg(target_pointer_width = "64")]
461+
const SIZE: usize = 1 << 47;
462+
#[cfg(target_pointer_width = "32")]
463+
const SIZE: usize = 1 << 30;
464+
#[cfg(target_pointer_width = "16")]
465+
const SIZE: usize = 1 << 13;
466+
struct HugeSize([u8; SIZE - 1]);
467+
let _ = (SIZE as *const HugeSize).align_offset(SIZE);
468+
}
469+
458470
#[test]
459471
fn offset_from() {
460472
let mut a = [0; 5];

0 commit comments

Comments
 (0)