From 357fdd48710c72c4adc282e9adda160568f5b889 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 20 Apr 2023 11:30:34 +0200
Subject: [PATCH] Load exceptions last in Tokenizer.from_bytes (#12553)

In `Tokenizer.from_bytes`, the exceptions should be loaded last so that
they are only processed once as part of loading the model.

The exceptions are tokenized as phrase matcher patterns in the
background, and this internal tokenization needs to stay in sync with
all the remaining tokenizer settings. If the exceptions are not loaded
last, there are speed regressions for `Tokenizer.from_bytes` and
`Tokenizer.from_disk` compared to `Tokenizer.add_special_case`, because
the caches are reloaded more often than necessary during
deserialization.
---
 spacy/tokenizer.pyx | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 0e75b5f7a66..a4a68ae8ef3 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -834,10 +834,12 @@ cdef class Tokenizer:
             self.token_match = re.compile(data["token_match"]).match
         if "url_match" in data and isinstance(data["url_match"], str):
             self.url_match = re.compile(data["url_match"]).match
-        if "rules" in data and isinstance(data["rules"], dict):
-            self.rules = data["rules"]
         if "faster_heuristics" in data:
             self.faster_heuristics = data["faster_heuristics"]
+        # always load rules last so that all other settings are set before the
+        # internal tokenization for the phrase matcher
+        if "rules" in data and isinstance(data["rules"], dict):
+            self.rules = data["rules"]
         return self