fix division on SPARC #393

Merged 1 commit on Nov 23, 2020
130 changes: 130 additions & 0 deletions src/int/specialized_div_rem/delegate.rs
@@ -185,3 +185,133 @@ macro_rules! impl_delegate {
}
};
}

/// Returns `n / d` and sets `*rem = n % d`.
///
/// This specialization exists because:
/// - The LLVM backend for 32-bit SPARC cannot compile functions that return `(u128, u128)`,
///   so we have to use an old-fashioned `&mut u128` argument to return the remainder.
/// - 64-bit SPARC does not have `u64 * u64 => u128` widening multiplication, which makes the
/// delegate algorithm strategy the only reasonably fast way to perform `u128` division.
#[doc(hidden)]
pub fn u128_divide_sparc(duo: u128, div: u128, rem: &mut u128) -> u128 {
use super::*;
let duo_lo = duo as u64;
let duo_hi = (duo >> 64) as u64;
let div_lo = div as u64;
let div_hi = (div >> 64) as u64;

match (div_lo == 0, div_hi == 0, duo_hi == 0) {
        // the divisor is zero
        (true, true, _) => zero_div_fn(),
        // `div_hi != 0` and `duo_hi == 0`: the dividend is smaller than the divisor
        (_, false, true) => {
*rem = duo;
return 0;
}
        // both the dividend and the divisor fit in 64 bits: delegate to 64-bit division
        (false, true, true) => {
let tmp = u64_by_u64_div_rem(duo_lo, div_lo);
*rem = tmp.1 as u128;
return tmp.0 as u128;
}
        // the divisor fits in 64 bits but the dividend does not
        (false, true, false) => {
if duo_hi < div_lo {
let norm_shift = u64_normalization_shift(div_lo, duo_hi, false);
let shl = if norm_shift == 0 {
64 - 1
} else {
64 - norm_shift
};

let mut div: u128 = div << shl;
let mut pow_lo: u64 = 1 << shl;
let mut quo_lo: u64 = 0;
let mut duo = duo;
                // restoring binary long division: subtract the shifted divisor
                // whenever it fits and set the corresponding quotient bit
                loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> 64) as u64;
if duo_hi == 0 {
let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
*rem = tmp.1 as u128;
return (quo_lo | tmp.0) as u128;
}
}
div >>= 1;
pow_lo >>= 1;
}
} else if duo_hi == div_lo {
let tmp = u64_by_u64_div_rem(duo as u64, div as u64);
*rem = tmp.1 as u128;
return (1 << 64) | (tmp.0 as u128);
} else {
                // the divisor fits in 32 bits: short division in base 2^32,
                // carrying each partial remainder into the next division
                if (div_lo >> 32) == 0 {
let div_0 = div_lo as u32 as u64;
let (quo_hi, rem_3) = u64_by_u64_div_rem(duo_hi, div_0);

let duo_mid = ((duo >> 32) as u32 as u64) | (rem_3 << 32);
let (quo_1, rem_2) = u64_by_u64_div_rem(duo_mid, div_0);

let duo_lo = (duo as u32 as u64) | (rem_2 << 32);
let (quo_0, rem_1) = u64_by_u64_div_rem(duo_lo, div_0);

*rem = rem_1 as u128;
return (quo_0 as u128) | ((quo_1 as u128) << 32) | ((quo_hi as u128) << 64);
}

let duo_lo = duo as u64;
let tmp = u64_by_u64_div_rem(duo_hi, div_lo);
let quo_hi = tmp.0;
let mut duo = (duo_lo as u128) | ((tmp.1 as u128) << 64);
if duo < div {
*rem = duo;
return (quo_hi as u128) << 64;
}

let mut div: u128 = div << (64 - 1);
let mut pow_lo: u64 = 1 << (64 - 1);
let mut quo_lo: u64 = 0;
                // the same restoring-division loop as above
                loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
let duo_hi = (duo >> 64) as u64;
if duo_hi == 0 {
let tmp = u64_by_u64_div_rem(duo as u64, div_lo);
*rem = tmp.1 as u128;
return (tmp.0) as u128 | (quo_lo as u128) | ((quo_hi as u128) << 64);
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
        // neither the dividend nor the divisor fits in 64 bits
        (_, false, false) => {
if duo < div {
*rem = duo;
return 0;
}
let div_original = div;
let shl = u64_normalization_shift(duo_hi, div_hi, false);
let mut duo = duo;
let mut div: u128 = div << shl;
let mut pow_lo: u64 = 1 << shl;
let mut quo_lo: u64 = 0;
            // restoring binary long division, as in the arm above
            loop {
let sub = duo.wrapping_sub(div);
if 0 <= (sub as i128) {
duo = sub;
quo_lo |= pow_lo;
if duo < div_original {
*rem = duo;
return quo_lo as u128;
}
}
div >>= 1;
pow_lo >>= 1;
}
}
}
}
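
As a quick illustration of the out-parameter convention described in the doc comment above — a minimal sketch, not part of the patch, assuming `u128_divide_sparc` is in scope:

fn div_rem_u128(n: u128, d: u128) -> (u128, u128) {
    let mut rem: u128 = 0;
    // The quotient comes back as the return value; the remainder is written
    // through the `&mut` argument because of the 32-bit SPARC ABI limitation.
    let quo = u128_divide_sparc(n, d, &mut rem);
    (quo, rem) // for nonzero `d`: (n / d, n % d)
}
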
49 changes: 28 additions & 21 deletions src/int/specialized_div_rem/mod.rs
@@ -46,6 +46,7 @@ mod binary_long;

#[macro_use]
mod delegate;
pub use self::delegate::u128_divide_sparc;

#[macro_use]
mod trifecta;
@@ -60,27 +61,31 @@ fn zero_div_fn() -> ! {
unsafe { core::hint::unreachable_unchecked() }
}

const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists.
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions.
        true
    }
};

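An aside for intuition, since `USE_LZ` feeds into `impl_normalization_shift!` below: a normalization shift lines up the divisor's most significant set bit with the dividend's. A minimal sketch of the two strategies the macro chooses between — illustrative only, not the macro's actual expansion:

// With a CLZ instruction, the shift is two counts and a subtraction
// (assumes `div != 0` and `div <= duo`).
fn normalization_shift_clz(duo: u64, div: u64) -> u32 {
    div.leading_zeros() - duo.leading_zeros()
}

// Without CLZ, locate each most significant set bit in software.
fn normalization_shift_soft(duo: u64, div: u64) -> u32 {
    fn msb_index(mut x: u64) -> u32 {
        let mut i = 0;
        while x > 1 {
            x >>= 1;
            i += 1;
        }
        i
    }
    msb_index(duo) - msb_index(div)
}
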
impl_normalization_shift!(
u32_normalization_shift,
USE_LZ,
@@ -115,8 +120,9 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_pointer_width = "16", target_pointer_width = "32")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
u128_div_rem,
@@ -131,8 +137,9 @@ impl_trifecta!(
// If the pointer width is less than 64, then the target architecture almost certainly does not
// have the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    any(target_pointer_width = "16", target_pointer_width = "32"),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
u128_div_rem,
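
The widening multiplication referred to above is `u64 * u64 => u128`; its cost is what decides between `trifecta` and `delegate`. A minimal sketch, not part of the patch:

// On x86_64 this compiles to a single widening `MUL`. SPARC has no
// instruction for the high half of the product, so it expands into several
// multiplications and additions, which is why `trifecta` is avoided there.
fn widening_mul(a: u64, b: u64) -> (u64, u64) {
    let wide = (a as u128) * (b as u128);
    ((wide >> 64) as u64, wide as u64) // (high half, low half)
}
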
38 changes: 32 additions & 6 deletions src/int/udiv.rs
@@ -1,3 +1,4 @@
pub use int::specialized_div_rem::u128_divide_sparc;
use int::specialized_div_rem::*;

intrinsics! {
@@ -46,25 +47,50 @@ intrinsics! {
quo_rem.0
}

// Note: we use block configuration and not `if cfg!(...)`, because we need to entirely disable
// the existence of `u128_div_rem` to get 32-bit SPARC to compile, see `u128_divide_sparc` docs.

#[win64_128bit_abi_hack]
/// Returns `n / d`
pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 {
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
u128_div_rem(n, d).0
}
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
u128_divide_sparc(n, d, &mut 0)
}
}

#[win64_128bit_abi_hack]
/// Returns `n % d`
pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 {
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
u128_div_rem(n, d).1
}
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
let mut rem = 0;
u128_divide_sparc(n, d, &mut rem);
rem
}
}

#[win64_128bit_abi_hack]
/// Returns `n / d` and sets `*rem = n % d`
pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 {
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))] {
let quo_rem = u128_div_rem(n, d);
if let Some(rem) = rem {
*rem = quo_rem.1;
}
quo_rem.0
}
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))] {
let mut tmp = 0;
let quo = u128_divide_sparc(n, d, &mut tmp);
if let Some(rem) = rem {
*rem = tmp;
}
quo
}
}
}
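
A usage sketch of the intrinsic's contract — illustrative only, assuming `__udivmodti4` is callable directly and `d` is nonzero:

fn check_udivmodti4(n: u128, d: u128) {
    let mut rem: u128 = 0;
    let quo = __udivmodti4(n, d, Some(&mut rem));
    // Both cfg branches above must agree with the native operators.
    assert_eq!(quo, n / d);
    assert_eq!(rem, n % d);
}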