From 4025110d9db5507e1804c4d297cb2fc821ca95c4 Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Sun, 4 Aug 2019 02:23:18 -0700
Subject: [PATCH 1/5] Fix a broken link in a comment in is_normalized.

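The link in this comment, http://unicode.org/reports/tr15/#Annex8,
doesn't work: its anchor no longer exists in the current version of
UAX #15.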

Going back through that UAX's history to find the version that was
current when this code was added in commit 7a0fedfd1 in 2009-04,
we find that the anchor still works in that version:
  https://www.unicode.org/reports/tr15/tr15-29.html#Annex8

It's a section heading "14. Detecting Normalization Forms".  Happily
the anchor that the corresponding section heading now offers looks
much more reasonable -- it's the title of the section -- and so likely
to be long-term stable.  ("Annex 8" seems like some kind of editing
error.)  Switch to that.
---
 Modules/unicodedata.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index ae0d4e46f9a409..7ced3abdab6ff4 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -791,8 +791,12 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
     if (self && UCD_Check(self))
         return NO;
 
-    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
-       as described in http://unicode.org/reports/tr15/#Annex8. */
+    /* This is an implementation of the following algorithm:
+       https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
+       See there for background.
+    */
+
+    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No. */
     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
 
     i = 0;

From 2a222da9638b77a014ba388ebccd1a0e9c9606af Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Sun, 4 Aug 2019 02:39:52 -0700
Subject: [PATCH 2/5] bpo-37966: Fully implement the UAX #15 quick-check
 algorithm.

The purpose of the `unicodedata.is_normalized` function is to answer
the question `str == unicodedata.normalize(form, str)` more
efficiently than writing just that, by using the "quick check"
optimization described in the Unicode standard in UAX #15.

However, it turns out the code doesn't implement the full algorithm
from the standard, and as a result we often miss the optimization and
end up having to compute the whole normalized string after all.

Implement the standard's algorithm.  This greatly speeds up
`unicodedata.is_normalized` in many cases where our partial variant
of quick-check had been returning MAYBE and the standard algorithm
returns NO.

In a quick test on my desktop, the existing code takes about 4.4 ms/MB
(so 4.4 ns per byte) when the partial quick-check returns MAYBE and it
has to do the slow normalize-and-compare:

  $ build.base/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \
      -- 'unicodedata.is_normalized("NFD", s)'
  50 loops, best of 5: 4.39 msec per loop

With this patch, it gets the answer instantly (58 ns) on the same 1 MB
string:

  $ build.dev/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \
      -- 'unicodedata.is_normalized("NFD", s)'
  5000000 loops, best of 5: 58.2 nsec per loop
---
 Doc/whatsnew/3.8.rst                          |  5 +-
 Lib/test/test_unicodedata.py                  |  2 +
 .../2019-08-27-21-21-36.bpo-37966.5OBLez.rst  |  3 +
 Modules/unicodedata.c                         | 65 +++++++++++--------
 4 files changed, 47 insertions(+), 28 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst

diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst
index cd31cf6db6e897..4dc2eab7917771 100644
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@@ -1090,8 +1090,9 @@ unicodedata
   <http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release.
 
 * New function :func:`~unicodedata.is_normalized` can be used to verify a string
-  is in a specific normal form. (Contributed by Max Belanger and David Euresti in
-  :issue:`32285`).
+  is in a specific normal form, often much faster than by actually normalizing
+  the string.  (Contributed by Max Belanger, David Euresti, and Greg Price in
+  :issue:`32285` and :issue:`37966`).
 
 
 unittest
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 7bc196be362b32..9ec2f11497268a 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -208,6 +208,8 @@ def test_issue29456(self):
         self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
         self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
 
+    # For tests of unicodedata.is_normalized / self.db.is_normalized ,
+    # see test_normalization.py .
 
     def test_east_asian_width(self):
         eaw = self.db.east_asian_width
diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst
new file mode 100644
index 00000000000000..6b9d69c5b3a9a4
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst	
@@ -0,0 +1,3 @@
+The implementation of :func:`~unicodedata.is_normalized` has been greatly
+sped up on strings that aren't normalized, by implementing the full
+normalization-quick-check algorithm from the Unicode standard.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 7ced3abdab6ff4..61451dad49b2ec 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -775,29 +775,37 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
     return result;
 }
 
-typedef enum {YES, NO, MAYBE} NormalMode;
-
-/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
-static NormalMode
-is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+// This needs to match the logic in makeunicodedata.py
+// which constructs the quickcheck data.
+typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
+
+/* Run the Unicode normalization "quickcheck" algorithm.
+ *
+ * Return YES or NO if quickcheck determines the input is certainly
+ * normalized or certainly not, and MAYBE if quickcheck is unable to
+ * tell. */
+static QuickcheckResult
+is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k)
 {
-    Py_ssize_t i, len;
-    int kind;
-    void *data;
-    unsigned char prev_combining = 0, quickcheck_mask;
+    /* This is an implementation of the following algorithm:
+       https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
+       See there for background.
+    */
 
     /* An older version of the database is requested, quickchecks must be
        disabled. */
     if (self && UCD_Check(self))
         return NO;
 
-    /* This is an implementation of the following algorithm:
-       https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
-       See there for background.
-    */
+    Py_ssize_t i, len;
+    int kind;
+    void *data;
+    unsigned char prev_combining = 0;
 
-    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No. */
-    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+    /* The two quickcheck bits at this shift have type QuickcheckResult. */
+    int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
+
+    QuickcheckResult result = YES; /* certainly normalized, unless we find something */
 
     i = 0;
     kind = PyUnicode_KIND(input);
@@ -806,16 +814,21 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
     while (i < len) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
-        unsigned char combining = record->combining;
-        unsigned char quickcheck = record->normalization_quick_check;
 
-        if (quickcheck & quickcheck_mask)
-            return MAYBE; /* this string might need normalization */
+        unsigned char combining = record->combining;
         if (combining && prev_combining > combining)
             return NO; /* non-canonical sort order, not normalized */
         prev_combining = combining;
+
+        unsigned char quickcheck = record->normalization_quick_check;
+        switch ((quickcheck >> quickcheck_shift) & 3) {
+        case NO:
+          return NO;
+        case MAYBE:
+          result = MAYBE; /* this string might need normalization */
+        }
     }
-    return YES; /* certainly normalized */
+    return result;
 }
 
 /*[clinic input]
@@ -848,7 +861,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
     PyObject *result;
     int nfc = 0;
     int k = 0;
-    NormalMode m;
+    QuickcheckResult m;
 
     PyObject *cmp;
     int match = 0;
@@ -871,7 +884,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
         return NULL;
     }
 
-    m = is_normalized(self, input, nfc, k);
+    m = is_normalized_quickcheck(self, input, nfc, k);
 
     if (m == MAYBE) {
         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
@@ -917,28 +930,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
     }
 
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
-        if (is_normalized(self, input, 1, 0) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 0) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 0);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
-        if (is_normalized(self, input, 1, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 1) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 1);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
-        if (is_normalized(self, input, 0, 0) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 0) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfd_nfkd(self, input, 0);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
-        if (is_normalized(self, input, 0, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 1) == YES) {
             Py_INCREF(input);
             return input;
         }

From 26892d32e64e03e55cfc8d772852cb054b35859f Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Sat, 24 Aug 2019 19:13:21 -0700
Subject: [PATCH 3/5] bpo-37966: Add yes_only flag to `is_normalized` helper.

This restores a small optimization that the original version of this
code had for the `unicodedata.normalize` use case.

With this, that case is actually faster than in master!

$ build.base/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \
    -- 'unicodedata.normalize("NFD", s)'
500 loops, best of 5: 561 usec per loop

$ build.dev/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \
    -- 'unicodedata.normalize("NFD", s)'
500 loops, best of 5: 512 usec per loop
---
 Modules/unicodedata.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 61451dad49b2ec..9951025495889c 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -783,9 +783,14 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
  *
  * Return YES or NO if quickcheck determines the input is certainly
  * normalized or certainly not, and MAYBE if quickcheck is unable to
- * tell. */
+ * tell.
+ *
+ * If `yes_only` is true, then return MAYBE as soon as we determine
+ * the answer is not YES.
+ */
 static QuickcheckResult
-is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k)
+is_normalized_quickcheck(PyObject *self, PyObject *input,
+                         int nfc, int k, int yes_only)
 {
     /* This is an implementation of the following algorithm:
        https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
@@ -820,12 +825,17 @@ is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k)
             return NO; /* non-canonical sort order, not normalized */
         prev_combining = combining;
 
-        unsigned char quickcheck = record->normalization_quick_check;
-        switch ((quickcheck >> quickcheck_shift) & 3) {
-        case NO:
-          return NO;
-        case MAYBE:
-          result = MAYBE; /* this string might need normalization */
+        unsigned char quickcheck_whole = record->normalization_quick_check;
+        if (yes_only) {
+            if (quickcheck_whole & (3 << quickcheck_shift))
+                return MAYBE;
+        } else {
+            switch ((quickcheck_whole >> quickcheck_shift) & 3) {
+            case NO:
+              return NO;
+            case MAYBE:
+              result = MAYBE; /* this string might need normalization */
+            }
         }
     }
     return result;
@@ -884,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
         return NULL;
     }
 
-    m = is_normalized_quickcheck(self, input, nfc, k);
+    m = is_normalized_quickcheck(self, input, nfc, k, 0);
 
     if (m == MAYBE) {
         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
@@ -930,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
     }
 
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
-        if (is_normalized_quickcheck(self, input, 1, 0) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 0, 1) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 0);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
-        if (is_normalized_quickcheck(self, input, 1, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 1, 1) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 1);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
-        if (is_normalized_quickcheck(self, input, 0, 0) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 0, 1) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfd_nfkd(self, input, 0);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
-        if (is_normalized_quickcheck(self, input, 0, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 1, 1) == YES) {
             Py_INCREF(input);
             return input;
         }

From 27e8122e1b28bf994d85eaa5283c5aad7c475de3 Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Wed, 28 Aug 2019 20:43:41 -0700
Subject: [PATCH 4/5] Move UAX #15 link to doc-comment.

---
 Modules/unicodedata.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 9951025495889c..dac0afb36be3a8 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -787,16 +787,14 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
  *
  * If `yes_only` is true, then return MAYBE as soon as we determine
  * the answer is not YES.
+ *
+ * For background and details on the algorithm, see UAX #15:
+ *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
  */
 static QuickcheckResult
 is_normalized_quickcheck(PyObject *self, PyObject *input,
                          int nfc, int k, int yes_only)
 {
-    /* This is an implementation of the following algorithm:
-       https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
-       See there for background.
-    */
-
     /* An older version of the database is requested, quickchecks must be
        disabled. */
     if (self && UCD_Check(self))

From 37627870c1dd173cc5da777e8aef629378b56996 Mon Sep 17 00:00:00 2001
From: Greg Price <gnprice@gmail.com>
Date: Wed, 28 Aug 2019 20:46:32 -0700
Subject: [PATCH 5/5] Use `bool` for a boolean.

---
 Modules/unicodedata.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index dac0afb36be3a8..5e8ba602d66848 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -19,6 +19,8 @@
 #include "ucnhash.h"
 #include "structmember.h"
 
+#include <stdbool.h>
+
 _Py_IDENTIFIER(NFC);
 _Py_IDENTIFIER(NFD);
 _Py_IDENTIFIER(NFKC);
@@ -793,7 +795,7 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
  */
 static QuickcheckResult
 is_normalized_quickcheck(PyObject *self, PyObject *input,
-                         int nfc, int k, int yes_only)
+                         int nfc, int k, bool yes_only)
 {
     /* An older version of the database is requested, quickchecks must be
        disabled. */
@@ -892,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
         return NULL;
     }
 
-    m = is_normalized_quickcheck(self, input, nfc, k, 0);
+    m = is_normalized_quickcheck(self, input, nfc, k, false);
 
     if (m == MAYBE) {
         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
@@ -938,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
     }
 
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
-        if (is_normalized_quickcheck(self, input, 1, 0, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 0);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
-        if (is_normalized_quickcheck(self, input, 1, 1, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfc_nfkc(self, input, 1);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
-        if (is_normalized_quickcheck(self, input, 0, 0, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
             Py_INCREF(input);
             return input;
         }
         return nfd_nfkd(self, input, 0);
     }
     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
-        if (is_normalized_quickcheck(self, input, 0, 1, 1) == YES) {
+        if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
             Py_INCREF(input);
             return input;
         }