From f64dbe6925cbdb743615ada84eb0f65950c61865 Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 2 Nov 2022 17:40:04 +0100
Subject: [PATCH 01/10] Add `training.before_update` callback

This callback can be used to implement training paradigms like gradual
(un)freezing of components (e.g., the Transformer) after a certain number
of training steps to mitigate catastrophic forgetting during fine-tuning.
---
 spacy/schemas.py                 | 1 +
 spacy/training/loop.py           | 5 +++++
 website/docs/api/data-formats.md | 1 +
 3 files changed, 7 insertions(+)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index c824d76b90c..174409cc94f 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -329,6 +329,7 @@ class ConfigSchemaTraining(BaseModel):
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
     annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
     before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
+    before_update: Optional[Callable[["Language", int], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
     # fmt: on

     class Config:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 06372cbb01c..844446bc07b 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -59,6 +59,7 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    before_update = T["before_update"]

     # Helper function to save checkpoints. This is a closure for convenience,
     # to avoid passing in all the args all the time.
@@ -89,6 +90,7 @@ def save_checkpoint(is_best):
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
         annotating_components=annotating_components,
+        before_update=before_update,
     )
     clean_output_dir(output_path)
     stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
@@ -150,6 +152,7 @@ def train_while_improving(
     max_steps: int,
     exclude: List[str],
     annotating_components: List[str],
+    before_update: Optional[Callable[[Language, int], None]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -198,6 +201,8 @@ def train_while_improving(
     words_seen = 0
     start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
+        if before_update:
+            before_update(nlp, step)
         dropout = next(dropouts)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index ce06c4ea85e..bdfb1edc02e 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,6 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
+| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and the current step. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, int], None]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |

From 43e5fb7b6b3322d21d06fa81bdd8f9efb0800c14 Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 2 Nov 2022 18:39:52 +0100
Subject: [PATCH 02/10] Fix type annotation, default config value

---
 spacy/default_config.cfg | 2 ++
 spacy/training/loop.py   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 86a72926e30..694fb732f43 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 # Optional callback before nlp object is saved to disk after training
 before_to_disk = null
+# Optional callback that is invoked at the start of each training step
+before_update = null

 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 844446bc07b..e8c03d9a8c4 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -152,7 +152,7 @@ def train_while_improving(
     max_steps: int,
     exclude: List[str],
     annotating_components: List[str],
-    before_update: Optional[Callable[[Language, int], None]],
+    before_update: Optional[Callable[["Language", int], None]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,

From e477eb2f7f3a22898f7a17c8844a50d6a3954700 Mon Sep 17 00:00:00 2001
From: shademe
Date: Mon, 7 Nov 2022 11:29:04 +0100
Subject: [PATCH 03/10] Generalize arguments passed to the callback

---
 spacy/training/loop.py           | 5 +++--
 website/docs/api/data-formats.md | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index e8c03d9a8c4..25b0df2fccc 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -152,7 +152,7 @@ def train_while_improving(
     max_steps: int,
     exclude: List[str],
     annotating_components: List[str],
-    before_update: Optional[Callable[["Language", int], None]],
+    before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -202,7 +202,8 @@ def train_while_improving(
     start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
         if before_update:
-            before_update(nlp, step)
+            before_update_args = {"current_step": step}
+            before_update(nlp, before_update_args)
         dropout = next(dropouts)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index bdfb1edc02e..111d772649b 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and the current step. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, int], None]]~~ |
+| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `current_step`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |

From 1f0863c886599fe4e8b24387afc99de18d039778 Mon Sep 17 00:00:00 2001
From: shademe
Date: Mon, 7 Nov 2022 11:29:23 +0100
Subject: [PATCH 04/10] Update schema

---
 spacy/schemas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index 174409cc94f..e48fe170207 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -329,7 +329,7 @@ class ConfigSchemaTraining(BaseModel):
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
     annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
     before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
-    before_update: Optional[Callable[["Language", int], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
+    before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
     # fmt: on

     class Config:

From e19b490d75d50f95ddcec46ddcf83b062dcff523 Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 9 Nov 2022 13:55:32 +0100
Subject: [PATCH 05/10] Pass `epoch` to callback, rename `current_step` to `step`

---
 spacy/training/loop.py           | 2 +-
 website/docs/api/data-formats.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 25b0df2fccc..8852577725e 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -202,7 +202,7 @@ def train_while_improving(
     start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
         if before_update:
-            before_update_args = {"current_step": step}
+            before_update_args = {"step": step, "epoch": epoch}
             before_update(nlp, before_update_args)
         dropout = next(dropouts)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 111d772649b..768844cf34c 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `current_step`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
+| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |

From aa921ff130dde42ba84faff16f95935531ea6542 Mon Sep 17 00:00:00 2001
From: shademe
Date: Fri, 18 Nov 2022 12:55:03 +0100
Subject: [PATCH 06/10] Add test

---
 spacy/tests/conftest.py                  |  7 ++
 spacy/tests/training/test_training.py    | 92 ++++++++++++++++++++++-
 spacy/tests/training/toy-en-corpus.spacy | Bin 0 -> 2703 bytes
 3 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/training/toy-en-corpus.spacy

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 0fc74243da3..c17fde0e87b 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 import pytest
 from spacy.util import get_lang_class
 from hypothesis import settings
@@ -47,6 +48,12 @@ def getopt(opt):
         pytest.skip("not referencing any issues")


+@pytest.fixture
+def test_dir(request):
+    print(request.fspath)
+    return Path(request.fspath).parent
+
+
 # Fixtures for language tokenizers (languages sorted alphabetically)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 4384a796d7c..d1313c3c92b 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1,4 +1,5 @@
 import random
+from confection import Config

 import numpy
 import pytest
@@ -11,8 +12,10 @@ from spacy.training.alignment_array import AlignmentArray
 from spacy.training.align import get_alignments
 from spacy.training.converters import json_to_docs
+from spacy.training.initialize import init_nlp
+from spacy.training.loop import train
 from spacy.util import get_words_and_spaces, load_model_from_path, minibatch
-from spacy.util import load_config_from_str
+from spacy.util import load_config_from_str, registry, load_model_from_config
 from thinc.api import compounding

 from ..util import make_tempdir
@@ -1112,3 +1115,90 @@ def test_retokenized_docs(doc):
         retokenizer.merge(doc1[0:2])
         retokenizer.merge(doc1[5:7])
     assert example.get_aligned("ORTH", as_string=True) == expected2
""" +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v2" + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.width} + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = null + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = null + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = 1 +gpu_allocator = "pytorch" +dropout = 0.1 +accumulate_gradient = 3 +patience = 5000 +max_epochs = 1 +max_steps = 6 +eval_frequency = 10 + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = False +get_length = null +size = 1 +buffer = 256 +""" + + +def test_training_before_update(test_dir): + ran_before_update = False + + @registry.callbacks(f"test_training_before_update_callback") + def make_before_creation(): + def before_update(nlp, args): + nonlocal ran_before_update + ran_before_update = True + assert "step" in args + assert "epoch" in args + + return before_update + + config = Config().from_str(training_config_string, interpolate=False) + config["corpora"]["train"]["path"] = str(test_dir / "toy-en-corpus.spacy") + config["corpora"]["dev"]["path"] = str(test_dir / "toy-en-corpus.spacy") + config["training"]["before_update"] = { + "@callbacks": "test_training_before_update_callback" + } + nlp = load_model_from_config(config, auto_fill=True, validate=True) + + nlp = init_nlp(config) + train(nlp) + assert ran_before_update == True diff --git a/spacy/tests/training/toy-en-corpus.spacy b/spacy/tests/training/toy-en-corpus.spacy new file mode 100644 index 0000000000000000000000000000000000000000..9a771be712ce2b4b83c1e9137d12b48534630f79 GIT binary patch literal 2703 zcmV;A3UKv!ob8)=R1`-Z$3aP4FT52+ffZa_6%>!Gx*8(qf?T2zP$TJ?>4AZs?xwp3 z9F51SiP5aL;!1=t-E-j)lqjMgD&nzgAjUJU?(wK^B*xH=?wq)+n)hjzKbqe^{ z^eC6G^BmF6^G;sv2K08D02JnZ`sbC*owh6gub!R1`Y<-w1Lk1f z_A4CEg~+S^e+k&*4HV|X0_Liw_+D<@B4OtlqMbr!<4Rl5fX{%!9Qa$Fgq^2|c2*0* z%?|24Bv6=xb*j|g);;lYEtya-7AVXe7PNm~Xu)crMJpaqbhDxFct$ z;cjfh>+X-I0EPMGPu;3NJZapZ)UT7H)u$=rga)86pX1Z3?B08goRnXGk2r6UYfm>% zYVscr6z0GWb}IBYzG`R8{mYAsflBd15_TF9?SQ=Vs`uh$P*(DbNzHwF~uSC?hkN;{5Ct&p(O;v~OD%?nBQ>EN_4yX*Zu>blhS z>c8e^<8hE*KlZKiTAui{87Rzqe&4g`YEslurG71!7e<`XM6PPE#Y)w0sD9nE_TbIn z@4h(=6y_kGDyDKs`SpwW&jWt9=gxg*iAsO6%*7kC)G1 zc{p(#P?-0*4mG0Wx;`sn5 z%x{+89u?6es@*z;eqiSF+wWbndL&Soue-lEc6?ZLAF1n-M~L&bA&(tr)w-tvg}I~R z+eZ&K9Y{8ntN;pgN7Xydif@O*!?Ie-`c3inzDE0PsEmL)YJDJQXGRdcr+=5?c0ggi z_1W|hv$^Gq4=aoZO6vpE@u1cRQh6J7Jmqgx&L30|5eF3Ja(ST@(N65}+|_4pmv{n& zxufy|yK$-Ze7yHLpfGn-UQn91QS(Ab%Del!8zyfB3Uj$Q`fJ-iK#e;aCG33rD~mgF z`>@W6JO5v&|M~dsuoIUChL&Fh3Uj%-3bnr0TKLBB?yDkz!W{U4R2_m^Uk7fBSUGs* zfvG@YF1LSx>Ia>TUco0vy-5Ovxm?_F*8VkW9b#k$zoXt4GXp5h<@T>p{lHo4ROxjv z_z!Y+vRAmJwEKFS2`J3v@&ao9S#*0=bCd4%=FpP+=UrAO_JMhiiI`cf$^$6O<>HRB_TSDT+Oc2TJO6S+UM`jQINxu;gGQL6?uW?j zzjbTKYjoAGQUirKxS!>$e5!Px@2(e_7*5pRt9iS&@nXzw7Upu-Kd9@pV0~jmNY{r; zU$wqQjiXz4?sy7MPqow! 
z&+BXHZ7)6W4a`xm=j8H2)3%V$r=7po11QXaU&}pLBR3v)-dbkS>Hr)l%>UP(tJ$2< z+jG5sPzq3(JF1RXT3@5a(WFhQBC{5jTmlMnx%wKlj(65Rg-5UX?Ryq5%YedMu0BAW zH>GtV>T?cz79Gj%GJDGh?)7W`-l?U0C(J<|VpI71o1>m{kQ-0H_=H1GrZ)WsD9k~f zxK_drD3IjrIP3YH%Csx)&x^170EM}u{QAm6mu1NtZ?yvob4Ts_J8PdDHC{{k0qVZX zsa((YhEC`D0EM}u?mJ(K4@pqXylj0sXY`G)p13@NxufEav-VxAg;dwIRo@N*3Uf#8 zyC_|!sQV!+1iuUV4qN*Ig}I~lL(1kCUdlgxr3~mVwS($!F2^Sy@%h)@)}ItQ1?$}ck%ml5ZfrRbrV}BvGo_5jpY~} znZ{mq*;HuyooT4)_ofk+fSAOP082ok*qZ`{z`#I0Xp|)=XyTYLrb(t?J~WgM4HH7c zSbHc%P^1nc_^=uF@R{L>De-aPrYJrtj*rsvQJg87j}~7^LbQS4V-on-Fg`9$h>J@Q z+jKsj6ygnxkdT;Q7MmZR7-CM0kDtLOCh#-j`I#ZY%#bj#MT#v>YzbnU#V02VDU`v4 z37AG>!8F-w!YcN9N+Vzd!DD(e_Tzg}W5Z|~#~9WkD#Z#IgA15po?S&T43&dx>?#Jw zGR2WwR0bWPF{=y=DZVx0jG)FyUac0?8rG&JaEvt2cJ<3~ZdX%!Jv5233ZYH0awVl+tQ>iWJ1J&r^&*85lDq4o9FdqnU0SAx%+?*s-jc zW^j5Z%}`pLZTp)dj1Xvsv(N@o%~=?nP!j^9qpf12agB||nQWaJx3NZ2EuITAmyU4) zmyU~XaD7{66gzPuILgW~m^zcSF=nc@)1)EDdWnA1MoAu5^rVzP6qUDrXcBJtESsTz#KM7RWc J{Qw%gck$eeb?X2C literal 0 HcmV?d00001 From d6d5c52135c3550f794152b93fc117f1a48d368c Mon Sep 17 00:00:00 2001 From: shademe Date: Fri, 18 Nov 2022 13:53:35 +0100 Subject: [PATCH 07/10] Simplify test --- spacy/tests/conftest.py | 6 -- spacy/tests/training/test_training.py | 82 +++++++++-------------- spacy/tests/training/toy-en-corpus.spacy | Bin 2703 -> 0 bytes 3 files changed, 31 insertions(+), 57 deletions(-) delete mode 100644 spacy/tests/training/toy-en-corpus.spacy diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index c17fde0e87b..f34b07bd5af 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -48,12 +48,6 @@ def getopt(opt): pytest.skip("not referencing any issues") -@pytest.fixture -def test_dir(request): - print(request.fspath) - return Path(request.fspath).parent - - # Fixtures for language tokenizers (languages sorted alphabetically) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index d1313c3c92b..37808db5be5 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -12,11 +12,10 @@ from spacy.training.alignment_array import AlignmentArray from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs -from spacy.training.initialize import init_nlp -from spacy.training.loop import train +from spacy.training.loop import train_while_improving from spacy.util import get_words_and_spaces, load_model_from_path, minibatch -from spacy.util import load_config_from_str, registry, load_model_from_config -from thinc.api import compounding +from spacy.util import load_config_from_str, load_model_from_config +from thinc.api import compounding, Adam from ..util import make_tempdir @@ -1146,59 +1145,40 @@ def test_retokenized_docs(doc): [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model.width} - -[corpora] - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = null - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = null - -[training] -train_corpus = "corpora.train" -dev_corpus = "corpora.dev" -seed = 1 -gpu_allocator = "pytorch" -dropout = 0.1 -accumulate_gradient = 3 -patience = 5000 -max_epochs = 1 -max_steps = 6 -eval_frequency = 10 - -[training.batcher] -@batchers = "spacy.batch_by_padded.v1" -discard_oversize = False -get_length = null -size = 1 -buffer = 256 """ -def test_training_before_update(test_dir): - ran_before_update = False +def test_training_before_update(doc): + def before_update(nlp, args): + assert args["step"] == 0 + assert 
args["epoch"] == 1 - @registry.callbacks(f"test_training_before_update_callback") - def make_before_creation(): - def before_update(nlp, args): - nonlocal ran_before_update - ran_before_update = True - assert "step" in args - assert "epoch" in args + # Raise an error here as the rest of the loop + # will not run to completion due to uninitialized + # models. + raise ValueError("ran_before_update") - return before_update + def generate_batch(): + yield 1, [Example(doc, doc)] config = Config().from_str(training_config_string, interpolate=False) - config["corpora"]["train"]["path"] = str(test_dir / "toy-en-corpus.spacy") - config["corpora"]["dev"]["path"] = str(test_dir / "toy-en-corpus.spacy") - config["training"]["before_update"] = { - "@callbacks": "test_training_before_update_callback" - } nlp = load_model_from_config(config, auto_fill=True, validate=True) + optimizer = Adam() + generator = train_while_improving( + nlp, + optimizer, + generate_batch(), + lambda: None, + dropout=0.1, + eval_frequency=100, + accumulate_gradient=10, + patience=10, + max_steps=100, + exclude=[], + annotating_components=[], + before_update=before_update, + ) - nlp = init_nlp(config) - train(nlp) - assert ran_before_update == True + with pytest.raises(ValueError, match="ran_before_update"): + for _ in generator: + pass diff --git a/spacy/tests/training/toy-en-corpus.spacy b/spacy/tests/training/toy-en-corpus.spacy deleted file mode 100644 index 9a771be712ce2b4b83c1e9137d12b48534630f79..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2703 zcmV;A3UKv!ob8)=R1`-Z$3aP4FT52+ffZa_6%>!Gx*8(qf?T2zP$TJ?>4AZs?xwp3 z9F51SiP5aL;!1=t-E-j)lqjMgD&nzgAjUJU?(wK^B*xH=?wq)+n)hjzKbqe^{ z^eC6G^BmF6^G;sv2K08D02JnZ`sbC*owh6gub!R1`Y<-w1Lk1f z_A4CEg~+S^e+k&*4HV|X0_Liw_+D<@B4OtlqMbr!<4Rl5fX{%!9Qa$Fgq^2|c2*0* z%?|24Bv6=xb*j|g);;lYEtya-7AVXe7PNm~Xu)crMJpaqbhDxFct$ z;cjfh>+X-I0EPMGPu;3NJZapZ)UT7H)u$=rga)86pX1Z3?B08goRnXGk2r6UYfm>% zYVscr6z0GWb}IBYzG`R8{mYAsflBd15_TF9?SQ=Vs`uh$P*(DbNzHwF~uSC?hkN;{5Ct&p(O;v~OD%?nBQ>EN_4yX*Zu>blhS z>c8e^<8hE*KlZKiTAui{87Rzqe&4g`YEslurG71!7e<`XM6PPE#Y)w0sD9nE_TbIn z@4h(=6y_kGDyDKs`SpwW&jWt9=gxg*iAsO6%*7kC)G1 zc{p(#P?-0*4mG0Wx;`sn5 z%x{+89u?6es@*z;eqiSF+wWbndL&Soue-lEc6?ZLAF1n-M~L&bA&(tr)w-tvg}I~R z+eZ&K9Y{8ntN;pgN7Xydif@O*!?Ie-`c3inzDE0PsEmL)YJDJQXGRdcr+=5?c0ggi z_1W|hv$^Gq4=aoZO6vpE@u1cRQh6J7Jmqgx&L30|5eF3Ja(ST@(N65}+|_4pmv{n& zxufy|yK$-Ze7yHLpfGn-UQn91QS(Ab%Del!8zyfB3Uj$Q`fJ-iK#e;aCG33rD~mgF z`>@W6JO5v&|M~dsuoIUChL&Fh3Uj%-3bnr0TKLBB?yDkz!W{U4R2_m^Uk7fBSUGs* zfvG@YF1LSx>Ia>TUco0vy-5Ovxm?_F*8VkW9b#k$zoXt4GXp5h<@T>p{lHo4ROxjv z_z!Y+vRAmJwEKFS2`J3v@&ao9S#*0=bCd4%=FpP+=UrAO_JMhiiI`cf$^$6O<>HRB_TSDT+Oc2TJO6S+UM`jQINxu;gGQL6?uW?j zzjbTKYjoAGQUirKxS!>$e5!Px@2(e_7*5pRt9iS&@nXzw7Upu-Kd9@pV0~jmNY{r; zU$wqQjiXz4?sy7MPqow! 
z&+BXHZ7)6W4a`xm=j8H2)3%V$r=7po11QXaU&}pLBR3v)-dbkS>Hr)l%>UP(tJ$2< z+jG5sPzq3(JF1RXT3@5a(WFhQBC{5jTmlMnx%wKlj(65Rg-5UX?Ryq5%YedMu0BAW zH>GtV>T?cz79Gj%GJDGh?)7W`-l?U0C(J<|VpI71o1>m{kQ-0H_=H1GrZ)WsD9k~f zxK_drD3IjrIP3YH%Csx)&x^170EM}u{QAm6mu1NtZ?yvob4Ts_J8PdDHC{{k0qVZX zsa((YhEC`D0EM}u?mJ(K4@pqXylj0sXY`G)p13@NxufEav-VxAg;dwIRo@N*3Uf#8 zyC_|!sQV!+1iuUV4qN*Ig}I~lL(1kCUdlgxr3~mVwS($!F2^Sy@%h)@)}ItQ1?$}ck%ml5ZfrRbrV}BvGo_5jpY~} znZ{mq*;HuyooT4)_ofk+fSAOP082ok*qZ`{z`#I0Xp|)=XyTYLrb(t?J~WgM4HH7c zSbHc%P^1nc_^=uF@R{L>De-aPrYJrtj*rsvQJg87j}~7^LbQS4V-on-Fg`9$h>J@Q z+jKsj6ygnxkdT;Q7MmZR7-CM0kDtLOCh#-j`I#ZY%#bj#MT#v>YzbnU#V02VDU`v4 z37AG>!8F-w!YcN9N+Vzd!DD(e_Tzg}W5Z|~#~9WkD#Z#IgA15po?S&T43&dx>?#Jw zGR2WwR0bWPF{=y=DZVx0jG)FyUac0?8rG&JaEvt2cJ<3~ZdX%!Jv5233ZYH0awVl+tQ>iWJ1J&r^&*85lDq4o9FdqnU0SAx%+?*s-jc zW^j5Z%}`pLZTp)dj1Xvsv(N@o%~=?nP!j^9qpf12agB||nQWaJx3NZ2EuITAmyU4) zmyU~XaD7{66gzPuILgW~m^zcSF=nc@)1)EDdWnA1MoAu5^rVzP6qUDrXcBJtESsTz#KM7RWc J{Qw%gck$eeb?X2C From 3fd19b8bd8c8813dab2cbe627a422ac189d5f199 Mon Sep 17 00:00:00 2001 From: shademe Date: Fri, 18 Nov 2022 14:50:39 +0100 Subject: [PATCH 08/10] Replace config string with `spacy.blank` --- spacy/tests/training/test_training.py | 39 +++------------------------ 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 37808db5be5..6d47cb6063a 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -3,6 +3,7 @@ import numpy import pytest +import spacy import srsly from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -14,7 +15,7 @@ from spacy.training.converters import json_to_docs from spacy.training.loop import train_while_improving from spacy.util import get_words_and_spaces, load_model_from_path, minibatch -from spacy.util import load_config_from_str, load_model_from_config +from spacy.util import load_config_from_str from thinc.api import compounding, Adam from ..util import make_tempdir @@ -1116,38 +1117,6 @@ def test_retokenized_docs(doc): assert example.get_aligned("ORTH", as_string=True) == expected2 -training_config_string = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 342 -depth = 4 -window_size = 1 -embed_size = 2000 -maxout_pieces = 3 -subword_features = true - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v2" - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.width} -""" - - def test_training_before_update(doc): def before_update(nlp, args): assert args["step"] == 0 @@ -1161,8 +1130,8 @@ def before_update(nlp, args): def generate_batch(): yield 1, [Example(doc, doc)] - config = Config().from_str(training_config_string, interpolate=False) - nlp = load_model_from_config(config, auto_fill=True, validate=True) + nlp = spacy.blank("en", config={"training": {}}) + nlp.add_pipe("tagger") optimizer = Adam() generator = train_while_improving( nlp, From 7fe3c5e0950a675c4fbf28011ce5c9b6948a8995 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 23 Nov 2022 13:05:12 +0100 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: Adriane Boyd --- spacy/tests/conftest.py | 1 - spacy/tests/training/test_training.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index f34b07bd5af..0fc74243da3 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,4 +1,3 @@
-from pathlib import Path
 import pytest
 from spacy.util import get_lang_class
 from hypothesis import settings
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 6d47cb6063a..b657daed290 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1130,7 +1130,7 @@ def before_update(nlp, args):
     def generate_batch():
         yield 1, [Example(doc, doc)]

-    nlp = spacy.blank("en", config={"training": {}})
+    nlp = spacy.blank("en")
     nlp.add_pipe("tagger")
     optimizer = Adam()
     generator = train_while_improving(

From e59af45354ffea82cb3a12f0e1151087d6ba24bb Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 23 Nov 2022 13:19:50 +0100
Subject: [PATCH 10/10] Cleanup imports

---
 spacy/tests/training/test_training.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index b657daed290..7933ea31fff 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1,5 +1,4 @@
 import random
-from confection import Config

 import numpy
 import pytest
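For downstream users, the merged feature is driven entirely from the config. Below is a minimal sketch of how the callback might be registered and wired up. The registry name `customize.log_before_update`, the `log_every` parameter, and the logging body are illustrative assumptions, not part of the patches above; the callback signature and the `step`/`epoch` dict entries match what `train_while_improving` passes as of PATCH 05.

```python
from typing import Any, Callable, Dict

import spacy
from spacy.language import Language


# Hypothetical registry name; registering the factory makes it addressable
# from the [training.before_update] block of the config.
@spacy.registry.callbacks("customize.log_before_update")
def make_before_update(
    log_every: int = 100,
) -> Callable[[Language, Dict[str, Any]], None]:
    def before_update(nlp: Language, args: Dict[str, Any]) -> None:
        # The training loop calls this at the start of every step with the
        # nlp object and {"step": ..., "epoch": ...}.
        if args["step"] % log_every == 0:
            print(f"epoch={args['epoch']} step={args['step']} pipeline={nlp.pipe_names}")

    return before_update
```

The callback is then referenced from the training section of the config, the same way the test in PATCH 06 sets it via `{"@callbacks": ...}`:

```ini
[training.before_update]
@callbacks = "customize.log_before_update"
log_every = 100
```

A gradual-unfreezing scheme of the kind motivated in PATCH 01's commit message would follow the same shape: compare `args["step"]` against a threshold and reconfigure the relevant components once it is crossed.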