From b03eb6d7a07f9f7c33b2623f76d75f11da08d458 Mon Sep 17 00:00:00 2001
From: Tomek Czajka <tczajka@gmail.com>
Date: Thu, 22 Oct 2020 22:28:48 -0700
Subject: [PATCH] Improve performance of the basic division algorithm.

We avoid computing a multiple of the divisor (and thus allocating memory) in the division loop.
Instead, directly subtract a multiple of the divisors from the dividend.

We also compute a more accurate guess for the next digit of the result. The guess is now exact in
almost all cases. In those cases where it's not exact, it will be off by exactly 1.
---
 src/algorithms.rs | 154 ++++++++++++++++++++++++++++++----------------
 src/lib.rs        |   1 +
 tests/biguint.rs  |  14 +++++
 3 files changed, 117 insertions(+), 52 deletions(-)

diff --git a/src/algorithms.rs b/src/algorithms.rs
index c65f3b4f..10f7f22f 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -266,6 +266,35 @@ pub(crate) fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
     }
 }
 
+/// Subtract a multiple.
+/// a -= b * c
+/// Returns a borrow (if a < b then borrow > 0).
+fn sub_mul_digit_same_len(a: &mut [BigDigit], b: &[BigDigit], c: BigDigit) -> BigDigit {
+    debug_assert!(a.len() == b.len());
+
+    // carry is between -big_digit::MAX and 0, so to avoid overflow we store
+    // offset_carry = carry + big_digit::MAX
+    let mut offset_carry = big_digit::MAX;
+
+    for (x, y) in a.iter_mut().zip(b) {
+        // We want to calculate sum = x - y * c + carry.
+        // sum >= -(big_digit::MAX * big_digit::MAX) - big_digit::MAX
+        // sum <= big_digit::MAX
+        // Offsetting sum by (big_digit::MAX << big_digit::BITS) puts it in DoubleBigDigit range.
+        let offset_sum = big_digit::to_doublebigdigit(big_digit::MAX, *x)
+            - big_digit::MAX as DoubleBigDigit
+            + offset_carry as DoubleBigDigit
+            - *y as DoubleBigDigit * c as DoubleBigDigit;
+
+        let (new_offset_carry, new_x) = big_digit::from_doublebigdigit(offset_sum);
+        offset_carry = new_offset_carry;
+        *x = new_x;
+    }
+
+    // Return the borrow.
+    big_digit::MAX - offset_carry
+}
+
 fn bigint_from_slice(slice: &[BigDigit]) -> BigInt {
     BigInt::from(biguint_from_vec(slice.to_vec()))
 }
@@ -631,78 +660,99 @@ pub(crate) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
     (q, r >> shift)
 }
 
-/// an implementation of Knuth, TAOCP vol 2 section 4.3, algorithm D
-///
-/// # Correctness
-///
-/// This function requires the following conditions to run correctly and/or effectively
-///
-/// - `a > b`
-/// - `d.data.len() > 1`
-/// - `d.data.last().unwrap().leading_zeros() == 0`
+/// An implementation of the base division algorithm.
+/// Knuth, TAOCP vol 2 section 4.3.1, algorithm D, with an improvement from exercises 19-21.
 fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
-    // The algorithm works by incrementally calculating "guesses", q0, for part of the
-    // remainder. Once we have any number q0 such that q0 * b <= a, we can set
+    debug_assert!(
+        a.data.len() >= b.data.len()
+            && b.data.len() > 1
+            && b.data.last().unwrap().leading_zeros() == 0
+    );
+
+    // The algorithm works by incrementally calculating "guesses", q0, for the next digit of the
+    // quotient. Once we have any number q0 such that (q0 << j) * b <= a, we can set
     //
-    //     q += q0
-    //     a -= q0 * b
+    //     q += q0 << j
+    //     a -= (q0 << j) * b
     //
     // and then iterate until a < b. Then, (q, a) will be our desired quotient and remainder.
     //
-    // q0, our guess, is calculated by dividing the last few digits of a by the last digit of b
-    // - this should give us a guess that is "close" to the actual quotient, but is possibly
-    // greater than the actual quotient. If q0 * b > a, we simply use iterated subtraction
-    // until we have a guess such that q0 * b <= a.
+    // q0, our guess, is calculated by dividing the last three digits of a by the last two digits of
+    // b - this will give us a guess that is close to the actual quotient, but is possibly greater.
+    // It can only be greater by 1 and only in rare cases, with probability at most
+    // 2^-(big_digit::BITS-1) for random a, see TAOCP 4.3.1 exercise 21.
     //
+    // If the quotient turns out to be too large, we adjust it by 1:
+    // q -= 1 << j
+    // a += b << j
+
+    // a0 stores an additional extra most significant digit of the dividend, not stored in a.
+    let mut a0 = 0;
+
+    // [b1, b0] are the two most significant digits of the divisor. They never change.
+    let b0 = *b.data.last().unwrap();
+    let b1 = b.data[b.data.len() - 2];
 
-    let bn = *b.data.last().unwrap();
     let q_len = a.data.len() - b.data.len() + 1;
     let mut q = BigUint {
         data: vec![0; q_len],
     };
 
-    // We reuse the same temporary to avoid hitting the allocator in our inner loop - this is
-    // sized to hold a0 (in the common case; if a particular digit of the quotient is zero a0
-    // can be bigger).
-    //
-    let mut tmp = BigUint {
-        data: Vec::with_capacity(2),
-    };
-
     for j in (0..q_len).rev() {
-        // When calculating our next guess q0, we don't need to consider the digits below j
-        // + b.data.len() - 1: we're guessing digit j of the quotient (i.e. q0 << j) from
-        // digit bn of the divisor (i.e. bn << (b.data.len() - 1) - so the product of those
-        // two numbers will be zero in all digits up to (j + b.data.len() - 1).
-        let offset = j + b.data.len() - 1;
-        if offset >= a.data.len() {
-            continue;
+        debug_assert!(a.data.len() == b.data.len() + j);
+
+        let a1 = *a.data.last().unwrap();
+        let a2 = a.data[a.data.len() - 2];
+
+        // The first q0 estimate is [a1,a0] / b0. It will never be too small, it may be too large
+        // by at most 2.
+        let (mut q0, mut r) = if a0 < b0 {
+            let (q0, r) = div_wide(a0, a1, b0);
+            (q0, r as DoubleBigDigit)
+        } else {
+            debug_assert!(a0 == b0);
+            // Avoid overflowing q0, we know the quotient fits in BigDigit.
+            // [a1,a0] = b0 * (1<<BITS - 1) + (a0 + a1)
+            (big_digit::MAX, a0 as DoubleBigDigit + a1 as DoubleBigDigit)
+        };
+
+        // r = [a1,a0] - q0 * b0
+        //
+        // Now we want to compute a more precise estimate [a2,a1,a0] / [b1,b0] which can only be
+        // less or equal to the current q0.
+        //
+        // q0 is too large if:
+        // [a2,a1,a0] < q0 * [b1,b0]
+        // (r << BITS) + a2 < q0 * b1
+        while r <= big_digit::MAX as DoubleBigDigit
+            && big_digit::to_doublebigdigit(r as BigDigit, a2)
+                < q0 as DoubleBigDigit * b1 as DoubleBigDigit
+        {
+            q0 -= 1;
+            r += b0 as DoubleBigDigit;
         }
 
-        // just avoiding a heap allocation:
-        let mut a0 = tmp;
-        a0.data.truncate(0);
-        a0.data.extend(a.data[offset..].iter().cloned());
-
-        // q0 << j * big_digit::BITS is our actual quotient estimate - we do the shifts
-        // implicitly at the end, when adding and subtracting to a and q. Not only do we
-        // save the cost of the shifts, the rest of the arithmetic gets to work with
-        // smaller numbers.
-        let (mut q0, _) = div_rem_digit(a0, bn);
-        let mut prod = b * &q0;
-
-        while cmp_slice(&prod.data[..], &a.data[j..]) == Greater {
-            q0 -= 1u32;
-            prod -= b;
+        // q0 is now either the correct quotient digit, or in rare cases 1 too large.
+        // Subtract (q0 << j) from a. This may overflow, in which case we will have to correct.
+
+        let mut borrow = sub_mul_digit_same_len(&mut a.data[j..], &b.data, q0);
+        if borrow > a0 {
+            // q0 is too large. We need to add back one multiple of b.
+            q0 -= 1;
+            borrow -= __add2(&mut a.data[j..], &b.data);
         }
+        // The top digit of a, stored in a0, has now been zeroed.
+        debug_assert!(borrow == a0);
 
-        add2(&mut q.data[j..], &q0.data[..]);
-        sub2(&mut a.data[j..], &prod.data[..]);
-        a.normalize();
+        q.data[j] = q0;
 
-        tmp = q0;
+        // Pop off the next top digit of a.
+        a0 = a.data.pop().unwrap();
     }
 
+    a.data.push(a0);
+    a.normalize();
+
     debug_assert!(a < *b);
 
     (q.normalized(), a)
diff --git a/src/lib.rs b/src/lib.rs
index 30c5abec..ab2bd4e3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -266,6 +266,7 @@ mod big_digit {
     pub(crate) const HALF: BigDigit = (1 << HALF_BITS) - 1;
 
     const LO_MASK: DoubleBigDigit = (1 << BITS) - 1;
+    pub(crate) const MAX: BigDigit = LO_MASK as BigDigit;
 
     #[inline]
     fn get_hi(n: DoubleBigDigit) -> BigDigit {
diff --git a/tests/biguint.rs b/tests/biguint.rs
index e14dbdd4..14c33daa 100644
--- a/tests/biguint.rs
+++ b/tests/biguint.rs
@@ -897,6 +897,20 @@ fn test_div_rem() {
     }
 }
 
+#[test]
+fn test_div_rem_big_multiple() {
+    let a = BigUint::from(3u32).pow(100u32);
+    let a2 = &a * &a;
+
+    let (div, rem) = a2.div_rem(&a);
+    assert_eq!(div, a);
+    assert!(rem.is_zero());
+
+    let (div, rem) = (&a2 - 1u32).div_rem(&a);
+    assert_eq!(div, &a - 1u32);
+    assert_eq!(rem, &a - 1u32);
+}
+
 #[test]
 fn test_div_ceil() {
     fn check(a: &BigUint, b: &BigUint, d: &BigUint, m: &BigUint) {