From 4025110d9db5507e1804c4d297cb2fc821ca95c4 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Sun, 4 Aug 2019 02:23:18 -0700 Subject: [PATCH 1/5] Fix a broken link in a comment in is_normalized. This link doesn't work. Going back through that UAX's history to find the version that was current when this code was added in commit 7a0fedfd1 in 2009-04, we find that that anchor still works in that version: https://www.unicode.org/reports/tr15/tr15-29.html#Annex8 It's a section heading "14. Detecting Normalization Forms". Happily the anchor that the corresponding section heading now offers looks much more reasonable -- it's the title of the section -- and so likely to be long-term stable. ("Annex 8" seems like some kind of editing error.) Switch to that. --- Modules/unicodedata.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index ae0d4e46f9a409..7ced3abdab6ff4 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -791,8 +791,12 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) if (self && UCD_Check(self)) return NO; - /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, - as described in http://unicode.org/reports/tr15/#Annex8. */ + /* This is an implementation of the following algorithm: + https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + See there for background. + */ + + /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No. */ quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); i = 0; From 2a222da9638b77a014ba388ebccd1a0e9c9606af Mon Sep 17 00:00:00 2001 From: Greg Price Date: Sun, 4 Aug 2019 02:39:52 -0700 Subject: [PATCH 2/5] bpo-37966: Fully implement the UAX #15 quick-check algorithm. 
The purpose of the `unicodedata.is_normalized` function is to answer the question `str == unicodedata.normalize(form, str)` more efficiently than writing just that, by using the "quick check" optimization described in the Unicode standard in UAX #15. However, it turns out the code doesn't implement the full algorithm from the standard, and as a result we often miss the optimization and end up having to compute the whole normalized string after all. Implement the standard's algorithm. This greatly speeds up `unicodedata.is_normalized` in many cases where our partial variant of quick-check had been returning MAYBE and the standard algorithm returns NO. At a quick test on my desktop, the existing code takes about 4.4 ms/MB (so 4.4 ns per byte) when the partial quick-check returns MAYBE and it has to do the slow normalize-and-compare: $ build.base/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)' 50 loops, best of 5: 4.39 msec per loop With this patch, it gets the answer instantly (58 ns) on the same 1 MB string: $ build.dev/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \ -- 'unicodedata.is_normalized("NFD", s)' 5000000 loops, best of 5: 58.2 nsec per loop --- Doc/whatsnew/3.8.rst | 5 +- Lib/test/test_unicodedata.py | 2 + .../2019-08-27-21-21-36.bpo-37966.5OBLez.rst | 3 + Modules/unicodedata.c | 65 +++++++++++-------- 4 files changed, 47 insertions(+), 28 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index cd31cf6db6e897..4dc2eab7917771 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -1090,8 +1090,9 @@ unicodedata `_ release. * New function :func:`~unicodedata.is_normalized` can be used to verify a string - is in a specific normal form. (Contributed by Max Belanger and David Euresti in - :issue:`32285`). 
+ is in a specific normal form, often much faster than by actually normalizing + the string. (Contributed by Max Belanger, David Euresti, and Greg Price in + :issue:`32285` and :issue:`37966`). unittest diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 7bc196be362b32..9ec2f11497268a 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -208,6 +208,8 @@ def test_issue29456(self): self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) + # For tests of unicodedata.is_normalized / self.db.is_normalized , + # see test_normalization.py . def test_east_asian_width(self): eaw = self.db.east_asian_width diff --git a/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst new file mode 100644 index 00000000000000..6b9d69c5b3a9a4 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2019-08-27-21-21-36.bpo-37966.5OBLez.rst @@ -0,0 +1,3 @@ +The implementation of :func:`~unicodedata.is_normalized` has been greatly +sped up on strings that aren't normalized, by implementing the full +normalization-quick-check algorithm from the Unicode standard. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 7ced3abdab6ff4..61451dad49b2ec 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -775,29 +775,37 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) return result; } -typedef enum {YES, NO, MAYBE} NormalMode; - -/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */ -static NormalMode -is_normalized(PyObject *self, PyObject *input, int nfc, int k) +// This needs to match the logic in makeunicodedata.py +// which constructs the quickcheck data. +typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; + +/* Run the Unicode normalization "quickcheck" algorithm. 
+ * + * Return YES or NO if quickcheck determines the input is certainly + * normalized or certainly not, and MAYBE if quickcheck is unable to + * tell. */ +static QuickcheckResult +is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k) { - Py_ssize_t i, len; - int kind; - void *data; - unsigned char prev_combining = 0, quickcheck_mask; + /* This is an implementation of the following algorithm: + https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms + See there for background. + */ /* An older version of the database is requested, quickchecks must be disabled. */ if (self && UCD_Check(self)) return NO; - /* This is an implementation of the following algorithm: - https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms - See there for background. - */ + Py_ssize_t i, len; + int kind; + void *data; + unsigned char prev_combining = 0; - /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No. */ - quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); + /* The two quickcheck bits at this shift have type QuickcheckResult. */ + int quickcheck_shift = (nfc ? 4 : 0) + (k ? 
2 : 0); + + QuickcheckResult result = YES; /* certainly normalized, unless we find something */ i = 0; kind = PyUnicode_KIND(input); @@ -806,16 +814,21 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) while (i < len) { Py_UCS4 ch = PyUnicode_READ(kind, data, i++); const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); - unsigned char combining = record->combining; - unsigned char quickcheck = record->normalization_quick_check; - if (quickcheck & quickcheck_mask) - return MAYBE; /* this string might need normalization */ + unsigned char combining = record->combining; if (combining && prev_combining > combining) return NO; /* non-canonical sort order, not normalized */ prev_combining = combining; + + unsigned char quickcheck = record->normalization_quick_check; + switch ((quickcheck >> quickcheck_shift) & 3) { + case NO: + return NO; + case MAYBE: + result = MAYBE; /* this string might need normalization */ + } } - return YES; /* certainly normalized */ + return result; } /*[clinic input] @@ -848,7 +861,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, PyObject *result; int nfc = 0; int k = 0; - NormalMode m; + QuickcheckResult m; PyObject *cmp; int match = 0; @@ -871,7 +884,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, return NULL; } - m = is_normalized(self, input, nfc, k); + m = is_normalized_quickcheck(self, input, nfc, k); if (m == MAYBE) { cmp = (nfc ? 
nfc_nfkc : nfd_nfkd)(self, input, k); @@ -917,28 +930,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { - if (is_normalized(self, input, 1, 0) == YES) { + if (is_normalized_quickcheck(self, input, 1, 0) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { - if (is_normalized(self, input, 1, 1) == YES) { + if (is_normalized_quickcheck(self, input, 1, 1) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { - if (is_normalized(self, input, 0, 0) == YES) { + if (is_normalized_quickcheck(self, input, 0, 0) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { - if (is_normalized(self, input, 0, 1) == YES) { + if (is_normalized_quickcheck(self, input, 0, 1) == YES) { Py_INCREF(input); return input; } From 26892d32e64e03e55cfc8d772852cb054b35859f Mon Sep 17 00:00:00 2001 From: Greg Price Date: Sat, 24 Aug 2019 19:13:21 -0700 Subject: [PATCH 3/5] bpo-37966: Add yes_only flag to `is_normalized` helper. This restores a small optimization that the original version of this code had for the `unicodedata.normalize` use case. With this, that case is actually faster than in master! 
$ build.base/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \ -- 'unicodedata.normalize("NFD", s)' 500 loops, best of 5: 561 usec per loop $ build.dev/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \ -- 'unicodedata.normalize("NFD", s)' 500 loops, best of 5: 512 usec per loop --- Modules/unicodedata.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 61451dad49b2ec..9951025495889c 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -783,9 +783,14 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; * * Return YES or NO if quickcheck determines the input is certainly * normalized or certainly not, and MAYBE if quickcheck is unable to - * tell. */ + * tell. + * + * If `yes_only` is true, then return MAYBE as soon as we determine + * the answer is not YES. + */ static QuickcheckResult -is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k) +is_normalized_quickcheck(PyObject *self, PyObject *input, + int nfc, int k, int yes_only) { /* This is an implementation of the following algorithm: https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms @@ -820,12 +825,17 @@ is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k) return NO; /* non-canonical sort order, not normalized */ prev_combining = combining; - unsigned char quickcheck = record->normalization_quick_check; - switch ((quickcheck >> quickcheck_shift) & 3) { - case NO: - return NO; - case MAYBE: - result = MAYBE; /* this string might need normalization */ + unsigned char quickcheck_whole = record->normalization_quick_check; + if (yes_only) { + if (quickcheck_whole & (3 << quickcheck_shift)) + return MAYBE; + } else { + switch ((quickcheck_whole >> quickcheck_shift) & 3) { + case NO: + return NO; + case MAYBE: + result = MAYBE; /* this string might need normalization */ + } } } return result; @@ -884,7 
+894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, return NULL; } - m = is_normalized_quickcheck(self, input, nfc, k); + m = is_normalized_quickcheck(self, input, nfc, k, 0); if (m == MAYBE) { cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); @@ -930,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { - if (is_normalized_quickcheck(self, input, 1, 0) == YES) { + if (is_normalized_quickcheck(self, input, 1, 0, 1) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { - if (is_normalized_quickcheck(self, input, 1, 1) == YES) { + if (is_normalized_quickcheck(self, input, 1, 1, 1) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { - if (is_normalized_quickcheck(self, input, 0, 0) == YES) { + if (is_normalized_quickcheck(self, input, 0, 0, 1) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { - if (is_normalized_quickcheck(self, input, 0, 1) == YES) { + if (is_normalized_quickcheck(self, input, 0, 1, 1) == YES) { Py_INCREF(input); return input; } From 27e8122e1b28bf994d85eaa5283c5aad7c475de3 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Wed, 28 Aug 2019 20:43:41 -0700 Subject: [PATCH 4/5] Move UAX #15 link to doc-comment. --- Modules/unicodedata.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 9951025495889c..dac0afb36be3a8 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -787,16 +787,14 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; * * If `yes_only` is true, then return MAYBE as soon as we determine * the answer is not YES. 
+ * + * For background and details on the algorithm, see UAX #15: + * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms */ static QuickcheckResult is_normalized_quickcheck(PyObject *self, PyObject *input, int nfc, int k, int yes_only) { - /* This is an implementation of the following algorithm: - https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms - See there for background. - */ - /* An older version of the database is requested, quickchecks must be disabled. */ if (self && UCD_Check(self)) From 37627870c1dd173cc5da777e8aef629378b56996 Mon Sep 17 00:00:00 2001 From: Greg Price Date: Wed, 28 Aug 2019 20:46:32 -0700 Subject: [PATCH 5/5] Use `bool` for a boolean. --- Modules/unicodedata.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index dac0afb36be3a8..5e8ba602d66848 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,8 @@ #include "ucnhash.h" #include "structmember.h" +#include <stdbool.h> + _Py_IDENTIFIER(NFC); _Py_IDENTIFIER(NFD); _Py_IDENTIFIER(NFKC); @@ -793,7 +795,7 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; */ static QuickcheckResult is_normalized_quickcheck(PyObject *self, PyObject *input, - int nfc, int k, int yes_only) + int nfc, int k, bool yes_only) { /* An older version of the database is requested, quickchecks must be disabled. */ @@ -892,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, return NULL; } - m = is_normalized_quickcheck(self, input, nfc, k, 0); + m = is_normalized_quickcheck(self, input, nfc, k, false); if (m == MAYBE) { cmp = (nfc ? 
nfc_nfkc : nfd_nfkd)(self, input, k); @@ -938,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { - if (is_normalized_quickcheck(self, input, 1, 0, 1) == YES) { + if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { - if (is_normalized_quickcheck(self, input, 1, 1, 1) == YES) { + if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { - if (is_normalized_quickcheck(self, input, 0, 0, 1) == YES) { + if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { - if (is_normalized_quickcheck(self, input, 0, 1, 1) == YES) { + if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) { Py_INCREF(input); return input; }