From 228b343f390ac91102b4f290fb3ddd8d29f06ef6 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 17 Mar 2024 20:39:55 -0500
Subject: [PATCH 01/12] initial

---
 Objects/listobject.c | 97 ++++++++++++++++++++++++--------------------
 1 file changed, 53 insertions(+), 44 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 6f919ce02b3ce2..c605b703a0db08 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1595,6 +1595,15 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
 /* Avoid malloc for small temp arrays. */
 #define MERGESTATE_TEMP_SIZE 256
 
+/* The largest value of minrun. This must be a power of 2, and >= 1, so that
+ * the compute_minrun() algorithm guarantees to return a result no larger than
+ * this.
+ */
+#define MAX_MINRUN 64
+#if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
+#error "MAX_MINRUN must be a power of 2, and >= 1"
+#endif
+
 /* One MergeState exists on the stack per invocation of mergesort.  It's just
  * a convenient way to pass state around among the helper functions.
  */
@@ -1665,53 +1674,53 @@ struct s_MergeState {
    the input (nothing is lost or duplicated).
 */
 static int
-binarysort(MergeState *ms, sortslice lo, PyObject **hi, PyObject **start)
+binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
 {
-    Py_ssize_t k;
-    PyObject **l, **p, **r;
+    Py_ssize_t k; /* for IFLT macro expansion */
+    PyObject ** const a = ss->keys;
+    PyObject ** const v = ss->values;
+    const bool has_values = v != NULL;
     PyObject *pivot;
-
-    assert(lo.keys <= start && start <= hi);
-    /* assert [lo.keys, start) is sorted */
-    if (lo.keys == start)
-        ++start;
-    for (; start < hi; ++start) {
-        /* set l to where *start belongs */
-        l = lo.keys;
-        r = start;
-        pivot = *r;
-        /* Invariants:
-         * pivot >= all in [lo.keys, l).
-         * pivot < all in [r, start).
-         * These are vacuously true at the start.
+    Py_ssize_t L, M, R;
+
+    assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
+    /* assert s[:ok) is sorted */
+
+    if (! ok)
+        ++ok;
+    for (; ok < n; ++ok) {
+s        /* set L to where a[ok] belongs */
+        L = 0;
+        R = ok;
+        pivot = a[ok];
+        /* Slice invariants. vacuously true at the start:
+         * 0:L  <= pivot
+         * L:R  unknown
+         * R:ok > pivot
          */
-        assert(l < r);
+        assert(L < R);
         do {
-            p = l + ((r - l) >> 1);
-            IFLT(pivot, *p)
-                r = p;
+            /* don't do silly ;-) things to prevent overflow when finding
+               the midpoint; L and R are very far from filling a Py_ssize_t */
+            M = (L + R) >> 1;
+            IFLT(pivot, a[M])
+                R = M;
             else
-                l = p+1;
-        } while (l < r);
-        assert(l == r);
-        /* The invariants still hold, so pivot >= all in [lo.keys, l) and
-           pivot < all in [l, start), so pivot belongs at l.  Note
-           that if there are elements equal to pivot, l points to the
-           first slot after them -- that's why this sort is stable.
-           Slide over to make room.
-           Caution: using memmove is much slower under MSVC 5;
-           we're not usually moving many slots. */
-        for (p = start; p > l; --p)
-            *p = *(p-1);
-        *l = pivot;
-        if (lo.values != NULL) {
-            Py_ssize_t offset = lo.values - lo.keys;
-            p = start + offset;
-            pivot = *p;
-            l += offset;
-            for ( ; p > l; --p)
-                *p = *(p-1);
-            *l = pivot;
+                L = M + 1;
+        } while (L < R);
+        assert(L == R);
+        /* a[:L] holds all elements from a[:ok[ <= pivot now, so pivot belongs
+           at index L. Slide a[L:ok] to the right a slot to make room for it.
+           Caution: using memmove is much slower under MSVC 5; we're not
+           usually moving many slots. */
+        for (M = ok; M > L; --M)
+            a[M] = a[M - 1];
+        a[L] = pivot;
+        if (has_values) {
+            pivot = v[ok];
+            for (M = ok; M > L; --M)
+                v[M] = v[M - 1];
+            v[L] = pivot;
         }
     }
     return 0;
@@ -2539,7 +2548,7 @@ merge_compute_minrun(Py_ssize_t n)
     Py_ssize_t r = 0;           /* becomes 1 if any 1 bits are shifted off */
 
     assert(n >= 0);
-    while (n >= 64) {
+    while (n >= MAX_MINRUN) {
         r |= n & 1;
         n >>= 1;
     }
@@ -2923,7 +2932,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
         if (n < minrun) {
            const Py_ssize_t force = nremaining <= minrun ?
                              nremaining : minrun;
-            if (binarysort(&ms, lo, lo.keys + force, lo.keys + n) < 0)
+            if (binarysort(&ms, &lo, force, n) < 0)
                 goto fail;
             n = force;
         }

From e3bed6157d267b7c02b23d8225538b42bdd19e50 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 17 Mar 2024 20:58:26 -0500
Subject: [PATCH 02/12] Add function comment.

---
 Objects/listobject.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index c605b703a0db08..d6233f931ffdb3 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1661,15 +1661,14 @@ struct s_MergeState {
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
 };
 
-/* binarysort is the best method for sorting small arrays: it does
-   few compares, but can do data movement quadratic in the number of
-   elements.
-   [lo.keys, hi) is a contiguous slice of a list of keys, and is sorted via
-   binary insertion.  This sort is stable.
-   On entry, must have lo.keys <= start <= hi, and that
-   [lo.keys, start) is already sorted (pass start == lo.keys if you don't
-   know!).
-   If islt() complains return -1, else 0.
+/* binarysort is the best method for sorting small arrays: it does few
+   compares, but can do data movement quadratic in the number of elements.
+   ss->keys is viewed as an array of n keys, a[:n]. a[:ok] is already sorted.
+   Pass ok = 0 (or 1) if you don't know.
+   It's sorted in-place, by a stable binary insertion sort. If ss->values
+   isn't NULL, it's permuted in lockstep with ss->keys.
+   On entry, must have n >= 1, and 0 <= ok <= n <= MAX_MINRUN.
+   Return -1 if comparison raises an exception, else 0.
    Even in case of error, the output slice will be some permutation of
    the input (nothing is lost or duplicated).
 */
@@ -1684,12 +1683,12 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
     Py_ssize_t L, M, R;
 
     assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
-    /* assert s[:ok) is sorted */
+    /* assert s[:ok) is sorted */
 
     if (! ok)
         ++ok;
     for (; ok < n; ++ok) {
-s        /* set L to where a[ok] belongs */
+        /* set L to where a[ok] belongs */
         L = 0;
         R = ok;
         pivot = a[ok];

From dfebf4133c82ff233452603f78737119b4023a62 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Sun, 17 Mar 2024 21:52:09 -0500
Subject: [PATCH 03/12] Repair closing braces in comments.

---
 Objects/listobject.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index d6233f931ffdb3..fc3154deda08fd 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1683,7 +1683,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
     Py_ssize_t L, M, R;
 
     assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
-    /* assert s[:ok) is sorted */
+    /* assert s[:ok] is sorted */
 
     if (! ok)
         ++ok;
@@ -1708,7 +1708,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
                 L = M + 1;
         } while (L < R);
         assert(L == R);
-        /* a[:L] holds all elements from a[:ok[ <= pivot now, so pivot belongs
+        /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
            at index L. Slide a[L:ok] to the right a slot to make room for it.
            Caution: using memmove is much slower under MSVC 5; we're not
            usually moving many slots. */

From e40cf9510e593214def609aa4f20e35b60e03834 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 13:12:23 -0500
Subject: [PATCH 04/12] Experiments.

Add preprocessor cruft so that you can very easily try either binary or
regular insertion sort. And for binary insertion sort, switch to a
different approach that tries to eliminate the inner branch by generating
conditional move instructions instead.

Alas, Visual Studio will not show me the machine code generated in a
Release build - the compiler and/or linker apparently optimized it so much
it can't relate it to the source code :-(
---
 Objects/listobject.c | 60 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index fc3154deda08fd..8a68be965d5375 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1680,13 +1680,59 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
     PyObject ** const v = ss->values;
     const bool has_values = v != NULL;
     PyObject *pivot;
-    Py_ssize_t L, M, R;
+    Py_ssize_t M;
 
     assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
     /* assert s[:ok] is sorted */
 
     if (! ok)
         ++ok;
+    /* Regular insertion sort has average- and worst-case O(n**2) cost
+       for both average and worst cases for both # of comparisons and number
+       of bytes moved. But its branches are highly predictable, and it loves
+       sorted input (O(n) compares and data movementl. This is significant in
+       cases like sortperf.py's %sort, where an out-of-order element near the
+       start of a run is moved into place slowly but then the remaining
+       elements up to length minrun are generally at worst one slot away from
+       their correct position (so only need 1 or 2 compares to resolve).
+
+       Binary insertion sort has worst, average, and best case O(n log n)
+       cost for # of comparisons, but worst and average case O(n**2) cost
+       for data movement. The more expensive comparisons, the more important
+       the comparison advantage. But its branches are less predictable the
+       more "randomish" the data, and that's so significant its worst case
+       in real life is random input rather than reverse-ordered (which does
+       about twice the data movement that random input does). */
+#if 1 // ordinary insertion sort
+    PyObject * vpivot = NULL;
+    for (; ok < n; ++ok) {
+        pivot = a[ok];
+        if (has_values)
+            vpivot = v[ok];
+        for (M = ok - 1; M >= 0; --M) {
+            k = ISLT(pivot, a[M]);
+            if (k < 0) {
+                a[M + 1] = pivot;
+                if (has_values)
+                    v[M + 1] = vpivot;
+                goto fail;
+            }
+            else if (k) {
+                a[M + 1] = a[M];
+                if (has_values) {
+                    v[M + 1] = v[M];
+                    v[M] = vpivot;
+                }
+            }
+            else
+                break;
+        }
+        a[M + 1] = pivot;
+        if (has_values)
+            v[M + 1] = vpivot;
+    }
+    return 0;
+#else // binary insertion sort
+    Py_ssize_t L, R;
     for (; ok < n; ++ok) {
         /* set L to where a[ok] belongs */
         L = 0;
         R = ok;
@@ -1702,10 +1748,19 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
             /* don't do silly ;-) things to prevent overflow when finding
                the midpoint; L and R are very far from filling a Py_ssize_t */
             M = (L + R) >> 1;
+#if 0
             IFLT(pivot, a[M])
                 R = M;
             else
                 L = M + 1;
+#else // try to get compiler to generate conditional move instructions
+            k = ISLT(pivot, a[M]);
+            if (k < 0)
+                goto fail;
+            Py_ssize_t Mp1 = M + 1;
+            R = k ? M : R;
+            L = k ? L : Mp1;
+#endif
         } while (L < R);
         assert(L == R);
         /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
            at index L. Slide a[L:ok] to the right a slot to make room for it.
            Caution: using memmove is much slower under MSVC 5; we're not
            usually moving many slots. */
         for (M = ok; M > L; --M)
             a[M] = a[M - 1];
         a[L] = pivot;
         if (has_values) {
             pivot = v[ok];
             for (M = ok; M > L; --M)
                 v[M] = v[M - 1];
             v[L] = pivot;
         }
     }
+#endif
     return 0;
 
 fail:

From 3e0fb61be3a3106b103c09d87172f2492d11736f Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 13:24:36 -0500
Subject: [PATCH 05/12] Remove redundant `return 0;`.

---
 Objects/listobject.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 8a68be965d5375..27c499b2fbb147 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1730,7 +1730,6 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
         if (has_values)
             v[M + 1] = vpivot;
     }
-    return 0;
 #else // binary insertion sort
     Py_ssize_t L, R;
     for (; ok < n; ++ok) {

From d60ab7445b84a98fe4ab9cc9342089e1cdb6b094 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 13:29:00 -0500
Subject: [PATCH 06/12] Remove unneeded store.

---
 Objects/listobject.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 27c499b2fbb147..2fa98952f26256 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1720,7 +1720,6 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
             else if (k) {
                 a[M + 1] = a[M];
                 if (has_values) {
                     v[M + 1] = v[M];
-                    v[M] = vpivot;
                 }
             }
             else

From 56b93c1e10f35415ebf8916660dc4bd2090fa75d Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 13:38:41 -0500
Subject: [PATCH 07/12] And remove needless braces.

---
 Objects/listobject.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 2fa98952f26256..dc6aeb98fdb2dc 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1718,9 +1718,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
             else if (k) {
                 a[M + 1] = a[M];
-                if (has_values) {
+                if (has_values)
                     v[M + 1] = v[M];
-                }
             }
             else

From 268d98d20e6d28e4003c26b49bfcb48844b3cc76 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 14:44:31 -0500
Subject: [PATCH 08/12] Comment repair.
---
 Objects/listobject.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index dc6aeb98fdb2dc..5bfa8248fc6df5 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1687,13 +1687,13 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
     if (! ok)
         ++ok;
     /* Regular insertion sort has average- and worst-case O(n**2) cost
-       for both average and worst cases for both # of comparisons and number
-       of bytes moved. But its branches are highly predictable, and it loves
-       sorted input (O(n) compares and data movementl. This is significant in
-       cases like sortperf.py's %sort, where an out-of-order element near the
-       start of a run is moved into place slowly but then the remaining
-       elements up to length minrun are generally at worst one slot away from
-       their correct position (so only need 1 or 2 compares to resolve).
+       for both # of comparisons and number of bytes moved. But its branches
+       are highly predictable, and it loves sorted input (O(n) compares and
+       data movementl. This is significant in cases like sortperf.py's %sort,
+       where an out-of-order element near the start of a run is moved into
+       place slowly but then the remaining elements up to length minrun are
+       generally at worst one slot away from their correct position (so only
+       need 1 or 2 compares to resolve).
 
        Binary insertion sort has worst, average, and best case O(n log n)
        cost for # of comparisons, but worst and average case O(n**2) cost
@@ -2587,10 +2587,10 @@ merge_force_collapse(MergeState *ms)
 /* Compute a good value for the minimum run length; natural runs shorter
  * than this are boosted artificially via binary insertion.
  *
- * If n < 64, return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return 32.
- * Else return an int k, 32 <= k <= 64, such that n/k is close to, but
- * strictly less than, an exact power of 2.
+ * If n < MAX_MINRUN, return n (it's too small to bother with fancy stuff).
+ * Else if n is an exact power of 2, return MAX_MINRUN / 2.
+ * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
+ * close to, but strictly less than, an exact power of 2.
  *
  * See listsort.txt for more info.
  */

From 1a20c11629c8e306fff6e572c799f62b4c8ae638 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 15:52:45 -0500
Subject: [PATCH 09/12] Modify to account for new MAX_MINRUN #define.

---
 Objects/listsort.txt | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/Objects/listsort.txt b/Objects/listsort.txt
index 4f84e2c87da7f1..f387d9c116e502 100644
--- a/Objects/listsort.txt
+++ b/Objects/listsort.txt
@@ -270,9 +270,9 @@ result.  This has two primary good effects:
 Computing minrun
 ----------------
-If N < 64, minrun is N.  IOW, binary insertion sort is used for the whole
-array then; it's hard to beat that given the overheads of trying something
-fancier (see note BINSORT).
+If N < MAX_MINRUN, minrun is N.  IOW, binary insertion sort is used for the
+whole array then; it's hard to beat that given the overheads of trying
+something fancier (see note BINSORT).
 
 When N is a power of 2, testing on random data showed that minrun values of
 16, 32, 64 and 128 worked about equally well.  At 256 the data-movement cost
@@ -310,12 +310,13 @@ place, and r < minrun is small compared to N), or q a little larger than a
 power of 2 regardless of r (then we've got a case similar to "2112", again
 leaving too little work for the last merge to do).
 
-Instead we pick a minrun in range(32, 65) such that N/minrun is exactly a
-power of 2, or if that isn't possible, is close to, but strictly less than,
-a power of 2.  This is easier to do than it may sound: take the first 6
-bits of N, and add 1 if any of the remaining bits are set.  In fact, that
-rule covers every case in this section, including small N and exact powers
-of 2; merge_compute_minrun() is a deceptively simple function.
+Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that
+N/minrun is exactly a power of 2, or if that isn't possible, is close to, but
+strictly less than, a power of 2.  This is easier to do than it may sound:
+take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining
+bits are set.  In fact, that rule covers every case in this section, including
+small N and exact powers of 2; merge_compute_minrun() is a deceptively simple
+function.
 
 
 The Merge Pattern

From 68b474d69151cc6e8147e692aa0254fa985e71f1 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 18:33:39 -0500
Subject: [PATCH 10/12] More comment repairs.

---
 Objects/listobject.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 09999b9257dff6..108e8d1edf2659 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1683,12 +1683,12 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
     Py_ssize_t M;
 
     assert(0 <= ok && ok <= n && 1 <= n && n <= MAX_MINRUN);
-    /* assert s[:ok] is sorted */
+    /* assert a[:ok] is sorted */
     if (! ok)
         ++ok;
     /* Regular insertion sort has average- and worst-case O(n**2) cost
        for both # of comparisons and number of bytes moved. But its branches
-       are highly predictable, and it loves sorted input (O(n) compares and
+       are highly predictable, and it loves sorted input (n-1 compares and no
        data movementl. This is significant in cases like sortperf.py's %sort,
        where an out-of-order element near the start of a run is moved into
        place slowly but then the remaining elements up to length minrun are
@@ -1736,9 +1736,9 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
         R = ok;
         pivot = a[ok];
         /* Slice invariants. vacuously true at the start:
-         * 0:L  <= pivot
-         * L:R  unknown
-         * R:ok > pivot
+         * all a[0:L] <= pivot
+         * all a[L:R] unknown
+         * all a[R:ok] > pivot
          */
         assert(L < R);
         do {

From e0a68379e2c98c6439b4dfdd29d445cae277c612 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Mon, 18 Mar 2024 19:20:25 -0500
Subject: [PATCH 11/12] comment repair

---
 Objects/listobject.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 108e8d1edf2659..044c2389134c4e 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1689,7 +1689,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
     /* Regular insertion sort has average- and worst-case O(n**2) cost
        for both # of comparisons and number of bytes moved. But its branches
        are highly predictable, and it loves sorted input (n-1 compares and no
-       data movementl. This is significant in cases like sortperf.py's %sort,
+       data movement). This is significant in cases like sortperf.py's %sort,
        where an out-of-order element near the start of a run is moved into
        place slowly but then the remaining elements up to length minrun are
        generally at worst one slot away from their correct position (so only

From 8674829d7926c2ae833536b57acfbfa47ea2e9e4 Mon Sep 17 00:00:00 2001
From: Tim Peters
Date: Thu, 21 Mar 2024 13:28:18 -0500
Subject: [PATCH 12/12] Mostly fleshing out comments.

Fiddled the #if's to leave binary insertion the method of choice, and
with the original branch structure. Alternatives are left in the code,
but #if'ed out.
---
 Objects/listobject.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/Objects/listobject.c b/Objects/listobject.c
index 7e74951a4df1ac..470ad8eb8135db 100644
--- a/Objects/listobject.c
+++ b/Objects/listobject.c
@@ -1726,7 +1726,10 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
        where an out-of-order element near the start of a run is moved into
        place slowly but then the remaining elements up to length minrun are
        generally at worst one slot away from their correct position (so only
-       need 1 or 2 compares to resolve).
+       need 1 or 2 compares to resolve). If comparisons are very fast (such
+       as for a list of Python floats), the simple inner loop leaves it
+       very competitive with binary insertion, despite that it does
+       significantly more compares overall on random data.
 
        Binary insertion sort has worst, average, and best case O(n log n)
        cost for # of comparisons, but worst and average case O(n**2) cost
@@ -1734,8 +1737,12 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
        the comparison advantage. But its branches are less predictable the
        more "randomish" the data, and that's so significant its worst case
        in real life is random input rather than reverse-ordered (which does
-       about twice the data movement that random input does). */
-#if 1 // ordinary insertion sort
+       about twice the data movement that random input does).
+
+       Note that the number of bytes moved doesn't seem to matter. MAX_MINRUN
+       of 64 is so small that the key and value pointers all fit in a corner
+       of L1 cache, and moving things around in that is very fast. */
+#if 0 // ordinary insertion sort.
     PyObject * vpivot = NULL;
     for (; ok < n; ++ok) {
         pivot = a[ok];
@@ -1778,12 +1785,17 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
             /* don't do silly ;-) things to prevent overflow when finding
                the midpoint; L and R are very far from filling a Py_ssize_t */
             M = (L + R) >> 1;
-#if 0
+#if 1 // straightforward, but highly unpredictable branch on random data
             IFLT(pivot, a[M])
                 R = M;
             else
                 L = M + 1;
-#else // try to get compiler to generate conditional move instructions
+#else
+            /* Try to get compiler to generate conditional move instructions
+               instead. Works fine, but leaving it disabled for now because
+               it's not yielding consistently faster sorts. Needs more
+               investigation. More computation in the inner loop adds its own
+               costs, which can be significant when compares are fast. */
             k = ISLT(pivot, a[M]);
             if (k < 0)
                 goto fail;
@@ -1796,7 +1808,8 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
         /* a[:L] holds all elements from a[:ok] <= pivot now, so pivot belongs
            at index L. Slide a[L:ok] to the right a slot to make room for it.
            Caution: using memmove is much slower under MSVC 5; we're not
-           usually moving many slots. */
+           usually moving many slots. Years later: under Visual Studio 2022,
+           memmove seems just slightly slower than doing it "by hand". */
         for (M = ok; M > L; --M)
             a[M] = a[M - 1];
         a[L] = pivot;
@@ -1807,7 +1820,7 @@ binarysort(MergeState *ms, const sortslice *ss, Py_ssize_t n, Py_ssize_t ok)
             v[L] = pivot;
         }
     }
-#endif
+#endif // pick binary or regular insertion sort
     return 0;
 
 fail:
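
A note for readers following along: the minrun rule these patches implement
and document (take the first log2(MAX_MINRUN) bits of N, and add 1 if any of
the remaining bits are set) is easy to experiment with outside CPython. Below
is a minimal standalone sketch; ptrdiff_t stands in for Py_ssize_t, and the
free-standing compute_minrun() wrapper and main() driver are illustrative
scaffolding, not code from the patch series.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_MINRUN 64   /* must be a power of 2, and >= 1, as in patch 01 */

/* Restatement of merge_compute_minrun(): if n < MAX_MINRUN, return n.
   Else return a k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
   close to, but strictly less than, an exact power of 2. */
static ptrdiff_t
compute_minrun(ptrdiff_t n)
{
    ptrdiff_t r = 0;    /* becomes 1 if any 1 bits are shifted off */

    assert(n >= 0);
    while (n >= MAX_MINRUN) {
        r |= n & 1;
        n >>= 1;
    }
    return n + r;
}

int
main(void)
{
    printf("%td\n", compute_minrun(63));    /* 63: small n is returned as is */
    printf("%td\n", compute_minrun(2048));  /* 32: 2048/32 is exactly 2**6 */
    printf("%td\n", compute_minrun(2049));  /* 33: 2049/33 is just under 2**6 */
    return 0;
}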
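
The conditional-move experiment from patch 04 can likewise be studied in
isolation. The sketch below restates the two inner-loop variants on a plain
sorted int array, with no MergeState or Python comparison machinery; the
function names are illustrative only, and a plain int comparison stands in
for ISLT(). Each returns the stable insertion point: the first index in
a[0..n) whose element is greater than pivot, so equal elements keep their
original order.

#include <stddef.h>

/* Branchy form: mirrors the IFLT(pivot, a[M]) version kept by patch 12.
   The if/else is a highly unpredictable branch on random data. */
static ptrdiff_t
insertion_point_branchy(const int *a, ptrdiff_t n, int pivot)
{
    ptrdiff_t L = 0, R = n;
    while (L < R) {
        /* L and R are far from filling a ptrdiff_t at these sizes, so the
           simple midpoint can't overflow */
        ptrdiff_t M = (L + R) >> 1;
        if (pivot < a[M])
            R = M;
        else
            L = M + 1;
    }
    return L;
}

/* Branch-free form: mirrors the "R = k ? M : R; L = k ? L : Mp1;" version,
   which invites the compiler to emit conditional moves (e.g. x86 cmov)
   instead of a conditional jump. */
static ptrdiff_t
insertion_point_cmov(const int *a, ptrdiff_t n, int pivot)
{
    ptrdiff_t L = 0, R = n;
    while (L < R) {
        ptrdiff_t M = (L + R) >> 1;
        int k = pivot < a[M];   /* 0 or 1 */
        ptrdiff_t Mp1 = M + 1;
        R = k ? M : R;          /* both arms are cheap register selects */
        L = k ? L : Mp1;
    }
    return L;
}

Whether the branch-free form wins is exactly the trade-off patch 12's comment
records: it removes an unpredictable branch but adds computation to every
iteration, which can cost more than the mispredictions it avoids when
compares are cheap.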
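
Finally, a sketch of how the pieces compose into the overall binarysort()
scheme, again on int arrays and reusing insertion_point_branchy() from the
previous sketch. The lockstep handling of an optional values array mirrors
the patch's has_values logic, but this is an editor's illustration under
those assumptions, not the patch's code.

/* Sort a[:n] by binary insertion, assuming a[:ok] is already sorted
   (pass ok = 0 or 1 if you don't know). If v is non-NULL, v[:n] is
   permuted in lockstep with a[:n], as binarysort() does for ss->values. */
static void
binary_insertion_sort(int *a, int *v, ptrdiff_t n, ptrdiff_t ok)
{
    if (ok == 0)
        ok = 1;
    for (; ok < n; ++ok) {
        int pivot = a[ok];
        ptrdiff_t L = insertion_point_branchy(a, ok, pivot);
        /* Slide a[L:ok] right one slot to make room; the patch's comment
           records memmove as slightly slower for these short moves. */
        for (ptrdiff_t M = ok; M > L; --M)
            a[M] = a[M - 1];
        a[L] = pivot;
        if (v != NULL) {
            int vpivot = v[ok];
            for (ptrdiff_t M = ok; M > L; --M)
                v[M] = v[M - 1];
            v[L] = vpivot;
        }
    }
}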