From f64dbe6925cbdb743615ada84eb0f65950c61865 Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 2 Nov 2022 17:40:04 +0100
Subject: [PATCH 01/10] Add `training.before_update` callback

This callback can be used to implement training paradigms like gradual
(un)freezing of components (e.g., the Transformer) after a certain number
of training steps to mitigate catastrophic forgetting during fine-tuning.
---
 spacy/schemas.py                 | 1 +
 spacy/training/loop.py           | 5 +++++
 website/docs/api/data-formats.md | 1 +
 3 files changed, 7 insertions(+)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index c824d76b90c..174409cc94f 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -329,6 +329,7 @@ class ConfigSchemaTraining(BaseModel):
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
     annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
     before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
+    before_update: Optional[Callable[["Language", int], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
     # fmt: on

     class Config:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 06372cbb01c..844446bc07b 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -59,6 +59,7 @@ def train(
     batcher = T["batcher"]
     train_logger = T["logger"]
     before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+    before_update = T["before_update"]

     # Helper function to save checkpoints. This is a closure for convenience,
     # to avoid passing in all the args all the time.
@@ -89,6 +90,7 @@ def save_checkpoint(is_best):
         eval_frequency=T["eval_frequency"],
         exclude=frozen_components,
         annotating_components=annotating_components,
+        before_update=before_update,
     )
     clean_output_dir(output_path)
     stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
@@ -150,6 +152,7 @@ def train_while_improving(
     max_steps: int,
     exclude: List[str],
     annotating_components: List[str],
+    before_update: Optional[Callable[[Language, int], None]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -198,6 +201,8 @@ def train_while_improving(
     words_seen = 0
     start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
+        if before_update:
+            before_update(nlp, step)
         dropout = next(dropouts)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index ce06c4ea85e..bdfb1edc02e 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,6 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
+| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and the current step. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, int], None]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |

From 43e5fb7b6b3322d21d06fa81bdd8f9efb0800c14 Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 2 Nov 2022 18:39:52 +0100
Subject: [PATCH 02/10] Fix type annotation, default config value

---
 spacy/default_config.cfg | 2 ++
 spacy/training/loop.py   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 86a72926e30..694fb732f43 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 # Optional callback before nlp object is saved to disk after training
 before_to_disk = null
+# Optional callback that is invoked at the start of each training step
+before_update = null

 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 844446bc07b..e8c03d9a8c4 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -152,7 +152,7 @@ def train_while_improving(
     max_steps: int,
     exclude: List[str],
     annotating_components: List[str],
-    before_update: Optional[Callable[[Language, int], None]],
+    before_update: Optional[Callable[["Language", int], None]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,

From e477eb2f7f3a22898f7a17c8844a50d6a3954700 Mon Sep 17 00:00:00 2001
From: shademe
Date: Mon, 7 Nov 2022 11:29:04 +0100
Subject: [PATCH 03/10] Generalize arguments passed to the callback

---
 spacy/training/loop.py           | 5 +++--
 website/docs/api/data-formats.md | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index e8c03d9a8c4..25b0df2fccc 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -152,7 +152,7 @@ def train_while_improving(
     max_steps: int,
     exclude: List[str],
     annotating_components: List[str],
-    before_update: Optional[Callable[["Language", int], None]],
+    before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
 ):
     """Train until an evaluation stops improving. Works as a generator,
     with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -202,7 +202,8 @@ def train_while_improving(
     start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
         if before_update:
-            before_update(nlp, step)
+            before_update_args = {"current_step": step}
+            before_update(nlp, before_update_args)
         dropout = next(dropouts)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index bdfb1edc02e..111d772649b 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and the current step. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, int], None]]~~ |
+| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `current_step`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |

From 1f0863c886599fe4e8b24387afc99de18d039778 Mon Sep 17 00:00:00 2001
From: shademe
Date: Mon, 7 Nov 2022 11:29:23 +0100
Subject: [PATCH 04/10] Update schema

---
 spacy/schemas.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/schemas.py b/spacy/schemas.py
index 174409cc94f..e48fe170207 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -329,7 +329,7 @@ class ConfigSchemaTraining(BaseModel):
     frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
     annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
     before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
-    before_update: Optional[Callable[["Language", int], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
+    before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
     # fmt: on

     class Config:

From e19b490d75d50f95ddcec46ddcf83b062dcff523 Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 9 Nov 2022 13:55:32 +0100
Subject: [PATCH 05/10] Pass `epoch` to callback, rename `current_step` to `step`

---
 spacy/training/loop.py           | 2 +-
 website/docs/api/data-formats.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 25b0df2fccc..8852577725e 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -202,7 +202,7 @@ def train_while_improving(
     start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
         if before_update:
-            before_update_args = {"current_step": step}
+            before_update_args = {"step": step, "epoch": epoch}
             before_update(nlp, before_update_args)
         dropout = next(dropouts)  # type: ignore
         for subbatch in subdivide_batch(batch, accumulate_gradient):
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 111d772649b..768844cf34c 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `current_step`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
+| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |

From aa921ff130dde42ba84faff16f95935531ea6542 Mon Sep 17 00:00:00 2001
From: shademe
Date: Fri, 18 Nov 2022 12:55:03 +0100
Subject: [PATCH 06/10] Add test

---
 spacy/tests/conftest.py                  |  7 ++
 spacy/tests/training/test_training.py    | 92 ++++++++++++++++++++++-
 spacy/tests/training/toy-en-corpus.spacy | Bin 0 -> 2703 bytes
 3 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/training/toy-en-corpus.spacy

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 0fc74243da3..c17fde0e87b 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 import pytest
 from spacy.util import get_lang_class
 from hypothesis import settings
@@ -47,6 +48,12 @@ def getopt(opt):
         pytest.skip("not referencing any issues")


+@pytest.fixture
+def test_dir(request):
+    print(request.fspath)
+    return Path(request.fspath).parent
+
+
 # Fixtures for language tokenizers (languages sorted alphabetically)
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 4384a796d7c..d1313c3c92b 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1,4 +1,5 @@
 import random
+from confection import Config

 import numpy
 import pytest
@@ -11,8 +12,10 @@ from spacy.training.alignment_array import AlignmentArray
 from spacy.training.align import get_alignments
 from spacy.training.converters import json_to_docs
+from spacy.training.initialize import init_nlp
+from spacy.training.loop import train
 from spacy.util import get_words_and_spaces, load_model_from_path, minibatch
-from spacy.util import load_config_from_str
+from spacy.util import load_config_from_str, registry, load_model_from_config
 from thinc.api import compounding

 from ..util import make_tempdir
@@ -1112,3 +1115,90 @@ def test_retokenized_docs(doc):
         retokenizer.merge(doc1[0:2])
         retokenizer.merge(doc1[5:7])
     assert example.get_aligned("ORTH", as_string=True) == expected2
""" +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.HashEmbedCNN.v1" +pretrained_vectors = null +width = 342 +depth = 4 +window_size = 1 +embed_size = 2000 +maxout_pieces = 3 +subword_features = true + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v2" + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.width} + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = null + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = null + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = 1 +gpu_allocator = "pytorch" +dropout = 0.1 +accumulate_gradient = 3 +patience = 5000 +max_epochs = 1 +max_steps = 6 +eval_frequency = 10 + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = False +get_length = null +size = 1 +buffer = 256 +""" + + +def test_training_before_update(test_dir): + ran_before_update = False + + @registry.callbacks(f"test_training_before_update_callback") + def make_before_creation(): + def before_update(nlp, args): + nonlocal ran_before_update + ran_before_update = True + assert "step" in args + assert "epoch" in args + + return before_update + + config = Config().from_str(training_config_string, interpolate=False) + config["corpora"]["train"]["path"] = str(test_dir / "toy-en-corpus.spacy") + config["corpora"]["dev"]["path"] = str(test_dir / "toy-en-corpus.spacy") + config["training"]["before_update"] = { + "@callbacks": "test_training_before_update_callback" + } + nlp = load_model_from_config(config, auto_fill=True, validate=True) + + nlp = init_nlp(config) + train(nlp) + assert ran_before_update == True diff --git a/spacy/tests/training/toy-en-corpus.spacy b/spacy/tests/training/toy-en-corpus.spacy new file mode 100644 index 0000000000000000000000000000000000000000..9a771be712ce2b4b83c1e9137d12b48534630f79 GIT binary patch literal 2703 zcmV;A3UKv!ob8)=R1`-Z$3aP4FT52+ffZa_6%>!Gx*8(qf?T2zP$TJ?>4AZs?xwp3 z9F51SiP5aL;!1=t-E-j)lqjMgD&nzgAjUJU?(wK^B*xH=?wq)+n)hjzKbqe^{ z^eC6G^BmF6^G;sv2K08D02JnZ`sbC*owh6gub!R1`Y<-w1Lk1f z_A4CEg~+S^e+k&*4HV|X0_Liw_+D<@B4OtlqMbr!<4Rl5fX{%!9Qa$Fgq^2|c2*0* z%?|24Bv6=xb*j|g);;lYEtya-7AVXe7PNm~Xu)crMJpaqbhDxFct$ z;cjfh>+X-I0EPMGPu;3NJZapZ)UT7H)u$=rga)86pX1Z3?B08goRnXGk2r6UYfm>% zYVscr6z0GWb}IBYzG`R8{mYAsflBd15_TF9?SQ=Vs`uh$P*(DbNzHwF~uSC?hkN;{5Ct&p(O;v~OD%?nBQ>EN_4yX*Zu>blhS z>c8e^<8hE*KlZKiTAui{87Rzqe&4g`YEslurG71!7e<`XM6PPE#Y)w0sD9nE_TbIn z@4h(=6y_kGDyDKs`SpwW&jWt9=gxg*iAsO6%*7kC)G1 zc{p(#P?-0*4mG0Wx;`sn5 z%x{+89u?6es@*z;eqiSF+wWbndL&Soue-lEc6?ZLAF1n-M~L&bA&(tr)w-tvg}I~R z+eZ&K9Y{8ntN;pgN7Xydif@O*!?Ie-`c3inzDE0PsEmL)YJDJQXGRdcr+=5?c0ggi z_1W|hv$^Gq4=aoZO6vpE@u1cRQh6J7Jmqgx&L30|5eF3Ja(ST@(N65}+|_4pmv{n& zxufy|yK$-Ze7yHLpfGn-UQn91QS(Ab%Del!8zyfB3Uj$Q`fJ-iK#e;aCG33rD~mgF z`>@W6JO5v&|M~dsuoIUChL&Fh3Uj%-3bnr0TKLBB?yDkz!W{U4R2_m^Uk7fBSUGs* zfvG@YF1LSx>Ia>TUco0vy-5Ovxm?_F*8VkW9b#k$zoXt4GXp5h<@T>p{lHo4ROxjv z_z!Y+vRAmJwEKFS2`J3v@&ao9S#*0=bCd4%=FpP+=UrAO_JMhiiI`cf$^$6O<>HRB_TSDT+Oc2TJO6S+UM`jQINxu;gGQL6?uW?j zzjbTKYjoAGQUirKxS!>$e5!Px@2(e_7*5pRt9iS&@nXzw7Upu-Kd9@pV0~jmNY{r; zU$wqQjiXz4?sy7MPqow! 
z&+BXHZ7)6W4a`xm=j8H2)3%V$r=7po11QXaU&}pLBR3v)-dbkS>Hr)l%>UP(tJ$2< z+jG5sPzq3(JF1RXT3@5a(WFhQBC{5jTmlMnx%wKlj(65Rg-5UX?Ryq5%YedMu0BAW zH>GtV>T?cz79Gj%GJDGh?)7W`-l?U0C(J<|VpI71o1>m{kQ-0H_=H1GrZ)WsD9k~f zxK_drD3IjrIP3YH%Csx)&x^170EM}u{QAm6mu1NtZ?yvob4Ts_J8PdDHC{{k0qVZX zsa((YhEC`D0EM}u?mJ(K4@pqXylj0sXY`G)p13@NxufEav-VxAg;dwIRo@N*3Uf#8 zyC_|!sQV!+1iuUV4qN*Ig}I~lL(1kCUdlgxr3~mVwS($!F2^Sy@%h)@)}ItQ1?$}ck%ml5ZfrRbrV}BvGo_5jpY~} znZ{mq*;HuyooT4)_ofk+fSAOP082ok*qZ`{z`#I0Xp|)=XyTYLrb(t?J~WgM4HH7c zSbHc%P^1nc_^=uF@R{L>De-aPrYJrtj*rsvQJg87j}~7^LbQS4V-on-Fg`9$h>J@Q z+jKsj6ygnxkdT;Q7MmZR7-CM0kDtLOCh#-j`I#ZY%#bj#MT#v>YzbnU#V02VDU`v4 z37AG>!8F-w!YcN9N+Vzd!DD(e_Tzg}W5Z|~#~9WkD#Z#IgA15po?S&T43&dx>?#Jw zGR2WwR0bWPF{=y=DZVx0jG)FyUac0?8rG&JaEvt2cJ<3~ZdX%!Jv5233ZYH0awVl+tQ>iWJ1J&r^&*85lDq4o9FdqnU0SAx%+?*s-jc zW^j5Z%}`pLZTp)dj1Xvsv(N@o%~=?nP!j^9qpf12agB||nQWaJx3NZ2EuITAmyU4) zmyU~XaD7{66gzPuILgW~m^zcSF=nc@)1)EDdWnA1MoAu5^rVzP6qUDrXcBJtESsTz#KM7RWc J{Qw%gck$eeb?X2C literal 0 HcmV?d00001 From d6d5c52135c3550f794152b93fc117f1a48d368c Mon Sep 17 00:00:00 2001 From: shademe Date: Fri, 18 Nov 2022 13:53:35 +0100 Subject: [PATCH 07/10] Simplify test --- spacy/tests/conftest.py | 6 -- spacy/tests/training/test_training.py | 82 +++++++++-------------- spacy/tests/training/toy-en-corpus.spacy | Bin 2703 -> 0 bytes 3 files changed, 31 insertions(+), 57 deletions(-) delete mode 100644 spacy/tests/training/toy-en-corpus.spacy diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index c17fde0e87b..f34b07bd5af 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -48,12 +48,6 @@ def getopt(opt): pytest.skip("not referencing any issues") -@pytest.fixture -def test_dir(request): - print(request.fspath) - return Path(request.fspath).parent - - # Fixtures for language tokenizers (languages sorted alphabetically) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index d1313c3c92b..37808db5be5 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -12,11 +12,10 @@ from spacy.training.alignment_array import AlignmentArray from spacy.training.align import get_alignments from spacy.training.converters import json_to_docs -from spacy.training.initialize import init_nlp -from spacy.training.loop import train +from spacy.training.loop import train_while_improving from spacy.util import get_words_and_spaces, load_model_from_path, minibatch -from spacy.util import load_config_from_str, registry, load_model_from_config -from thinc.api import compounding +from spacy.util import load_config_from_str, load_model_from_config +from thinc.api import compounding, Adam from ..util import make_tempdir @@ -1146,59 +1145,40 @@ def test_retokenized_docs(doc): [components.tagger.model.tok2vec] @architectures = "spacy.Tok2VecListener.v1" width = ${components.tok2vec.model.width} - -[corpora] - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = null - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = null - -[training] -train_corpus = "corpora.train" -dev_corpus = "corpora.dev" -seed = 1 -gpu_allocator = "pytorch" -dropout = 0.1 -accumulate_gradient = 3 -patience = 5000 -max_epochs = 1 -max_steps = 6 -eval_frequency = 10 - -[training.batcher] -@batchers = "spacy.batch_by_padded.v1" -discard_oversize = False -get_length = null -size = 1 -buffer = 256 """ -def test_training_before_update(test_dir): - ran_before_update = False +def test_training_before_update(doc): + def before_update(nlp, args): + assert args["step"] == 0 + assert 
args["epoch"] == 1 - @registry.callbacks(f"test_training_before_update_callback") - def make_before_creation(): - def before_update(nlp, args): - nonlocal ran_before_update - ran_before_update = True - assert "step" in args - assert "epoch" in args + # Raise an error here as the rest of the loop + # will not run to completion due to uninitialized + # models. + raise ValueError("ran_before_update") - return before_update + def generate_batch(): + yield 1, [Example(doc, doc)] config = Config().from_str(training_config_string, interpolate=False) - config["corpora"]["train"]["path"] = str(test_dir / "toy-en-corpus.spacy") - config["corpora"]["dev"]["path"] = str(test_dir / "toy-en-corpus.spacy") - config["training"]["before_update"] = { - "@callbacks": "test_training_before_update_callback" - } nlp = load_model_from_config(config, auto_fill=True, validate=True) + optimizer = Adam() + generator = train_while_improving( + nlp, + optimizer, + generate_batch(), + lambda: None, + dropout=0.1, + eval_frequency=100, + accumulate_gradient=10, + patience=10, + max_steps=100, + exclude=[], + annotating_components=[], + before_update=before_update, + ) - nlp = init_nlp(config) - train(nlp) - assert ran_before_update == True + with pytest.raises(ValueError, match="ran_before_update"): + for _ in generator: + pass diff --git a/spacy/tests/training/toy-en-corpus.spacy b/spacy/tests/training/toy-en-corpus.spacy deleted file mode 100644 index 9a771be712ce2b4b83c1e9137d12b48534630f79..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2703 zcmV;A3UKv!ob8)=R1`-Z$3aP4FT52+ffZa_6%>!Gx*8(qf?T2zP$TJ?>4AZs?xwp3 z9F51SiP5aL;!1=t-E-j)lqjMgD&nzgAjUJU?(wK^B*xH=?wq)+n)hjzKbqe^{ z^eC6G^BmF6^G;sv2K08D02JnZ`sbC*owh6gub!R1`Y<-w1Lk1f z_A4CEg~+S^e+k&*4HV|X0_Liw_+D<@B4OtlqMbr!<4Rl5fX{%!9Qa$Fgq^2|c2*0* z%?|24Bv6=xb*j|g);;lYEtya-7AVXe7PNm~Xu)crMJpaqbhDxFct$ z;cjfh>+X-I0EPMGPu;3NJZapZ)UT7H)u$=rga)86pX1Z3?B08goRnXGk2r6UYfm>% zYVscr6z0GWb}IBYzG`R8{mYAsflBd15_TF9?SQ=Vs`uh$P*(DbNzHwF~uSC?hkN;{5Ct&p(O;v~OD%?nBQ>EN_4yX*Zu>blhS z>c8e^<8hE*KlZKiTAui{87Rzqe&4g`YEslurG71!7e<`XM6PPE#Y)w0sD9nE_TbIn z@4h(=6y_kGDyDKs`SpwW&jWt9=gxg*iAsO6%*7kC)G1 zc{p(#P?-0*4mG0Wx;`sn5 z%x{+89u?6es@*z;eqiSF+wWbndL&Soue-lEc6?ZLAF1n-M~L&bA&(tr)w-tvg}I~R z+eZ&K9Y{8ntN;pgN7Xydif@O*!?Ie-`c3inzDE0PsEmL)YJDJQXGRdcr+=5?c0ggi z_1W|hv$^Gq4=aoZO6vpE@u1cRQh6J7Jmqgx&L30|5eF3Ja(ST@(N65}+|_4pmv{n& zxufy|yK$-Ze7yHLpfGn-UQn91QS(Ab%Del!8zyfB3Uj$Q`fJ-iK#e;aCG33rD~mgF z`>@W6JO5v&|M~dsuoIUChL&Fh3Uj%-3bnr0TKLBB?yDkz!W{U4R2_m^Uk7fBSUGs* zfvG@YF1LSx>Ia>TUco0vy-5Ovxm?_F*8VkW9b#k$zoXt4GXp5h<@T>p{lHo4ROxjv z_z!Y+vRAmJwEKFS2`J3v@&ao9S#*0=bCd4%=FpP+=UrAO_JMhiiI`cf$^$6O<>HRB_TSDT+Oc2TJO6S+UM`jQINxu;gGQL6?uW?j zzjbTKYjoAGQUirKxS!>$e5!Px@2(e_7*5pRt9iS&@nXzw7Upu-Kd9@pV0~jmNY{r; zU$wqQjiXz4?sy7MPqow! 
z&+BXHZ7)6W4a`xm=j8H2)3%V$r=7po11QXaU&}pLBR3v)-dbkS>Hr)l%>UP(tJ$2< z+jG5sPzq3(JF1RXT3@5a(WFhQBC{5jTmlMnx%wKlj(65Rg-5UX?Ryq5%YedMu0BAW zH>GtV>T?cz79Gj%GJDGh?)7W`-l?U0C(J<|VpI71o1>m{kQ-0H_=H1GrZ)WsD9k~f zxK_drD3IjrIP3YH%Csx)&x^170EM}u{QAm6mu1NtZ?yvob4Ts_J8PdDHC{{k0qVZX zsa((YhEC`D0EM}u?mJ(K4@pqXylj0sXY`G)p13@NxufEav-VxAg;dwIRo@N*3Uf#8 zyC_|!sQV!+1iuUV4qN*Ig}I~lL(1kCUdlgxr3~mVwS($!F2^Sy@%h)@)}ItQ1?$}ck%ml5ZfrRbrV}BvGo_5jpY~} znZ{mq*;HuyooT4)_ofk+fSAOP082ok*qZ`{z`#I0Xp|)=XyTYLrb(t?J~WgM4HH7c zSbHc%P^1nc_^=uF@R{L>De-aPrYJrtj*rsvQJg87j}~7^LbQS4V-on-Fg`9$h>J@Q z+jKsj6ygnxkdT;Q7MmZR7-CM0kDtLOCh#-j`I#ZY%#bj#MT#v>YzbnU#V02VDU`v4 z37AG>!8F-w!YcN9N+Vzd!DD(e_Tzg}W5Z|~#~9WkD#Z#IgA15po?S&T43&dx>?#Jw zGR2WwR0bWPF{=y=DZVx0jG)FyUac0?8rG&JaEvt2cJ<3~ZdX%!Jv5233ZYH0awVl+tQ>iWJ1J&r^&*85lDq4o9FdqnU0SAx%+?*s-jc zW^j5Z%}`pLZTp)dj1Xvsv(N@o%~=?nP!j^9qpf12agB||nQWaJx3NZ2EuITAmyU4) zmyU~XaD7{66gzPuILgW~m^zcSF=nc@)1)EDdWnA1MoAu5^rVzP6qUDrXcBJtESsTz#KM7RWc J{Qw%gck$eeb?X2C From 3fd19b8bd8c8813dab2cbe627a422ac189d5f199 Mon Sep 17 00:00:00 2001 From: shademe Date: Fri, 18 Nov 2022 14:50:39 +0100 Subject: [PATCH 08/10] Replace config string with `spacy.blank` --- spacy/tests/training/test_training.py | 39 +++------------------------ 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 37808db5be5..6d47cb6063a 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -3,6 +3,7 @@ import numpy import pytest +import spacy import srsly from spacy.lang.en import English from spacy.tokens import Doc, DocBin @@ -14,7 +15,7 @@ from spacy.training.converters import json_to_docs from spacy.training.loop import train_while_improving from spacy.util import get_words_and_spaces, load_model_from_path, minibatch -from spacy.util import load_config_from_str, load_model_from_config +from spacy.util import load_config_from_str from thinc.api import compounding, Adam from ..util import make_tempdir @@ -1116,38 +1117,6 @@ def test_retokenized_docs(doc): assert example.get_aligned("ORTH", as_string=True) == expected2 -training_config_string = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 342 -depth = 4 -window_size = 1 -embed_size = 2000 -maxout_pieces = 3 -subword_features = true - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v2" - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.width} -""" - - def test_training_before_update(doc): def before_update(nlp, args): assert args["step"] == 0 @@ -1161,8 +1130,8 @@ def before_update(nlp, args): def generate_batch(): yield 1, [Example(doc, doc)] - config = Config().from_str(training_config_string, interpolate=False) - nlp = load_model_from_config(config, auto_fill=True, validate=True) + nlp = spacy.blank("en", config={"training": {}}) + nlp.add_pipe("tagger") optimizer = Adam() generator = train_while_improving( nlp, From 7fe3c5e0950a675c4fbf28011ce5c9b6948a8995 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 23 Nov 2022 13:05:12 +0100 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: Adriane Boyd --- spacy/tests/conftest.py | 1 - spacy/tests/training/test_training.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git 
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index f34b07bd5af..0fc74243da3 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,4 +1,3 @@
-from pathlib import Path
 import pytest
 from spacy.util import get_lang_class
 from hypothesis import settings
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 6d47cb6063a..b657daed290 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1130,7 +1130,7 @@ def before_update(nlp, args):
     def generate_batch():
         yield 1, [Example(doc, doc)]

-    nlp = spacy.blank("en", config={"training": {}})
+    nlp = spacy.blank("en")
     nlp.add_pipe("tagger")
     optimizer = Adam()
     generator = train_while_improving(

From e59af45354ffea82cb3a12f0e1151087d6ba24bb Mon Sep 17 00:00:00 2001
From: shademe
Date: Wed, 23 Nov 2022 13:19:50 +0100
Subject: [PATCH 10/10] Cleanup imports

---
 spacy/tests/training/test_training.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index b657daed290..7933ea31fff 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -1,5 +1,4 @@
 import random
-from confection import Config

 import numpy
 import pytest
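For downstream users, the merged feature is driven entirely from the config. Below is a minimal sketch of how the callback might be registered and wired up. The registry name `customize.log_before_update`, the `log_every` parameter, and the logging body are illustrative assumptions, not part of the patches above; the callback signature and the `step`/`epoch` dict entries match what `train_while_improving` passes as of PATCH 05.

```python
from typing import Any, Callable, Dict

import spacy
from spacy.language import Language


# Hypothetical registry name; registering the factory makes it addressable
# from the [training.before_update] block of the config.
@spacy.registry.callbacks("customize.log_before_update")
def make_before_update(
    log_every: int = 100,
) -> Callable[[Language, Dict[str, Any]], None]:
    def before_update(nlp: Language, args: Dict[str, Any]) -> None:
        # The training loop calls this at the start of every step with the
        # nlp object and {"step": ..., "epoch": ...}.
        if args["step"] % log_every == 0:
            print(f"epoch={args['epoch']} step={args['step']} pipeline={nlp.pipe_names}")

    return before_update
```

The callback is then referenced from the training section of the config, the same way the test in PATCH 06 sets it via `{"@callbacks": ...}`:

```ini
[training.before_update]
@callbacks = "customize.log_before_update"
log_every = 100
```

A gradual-unfreezing scheme of the kind motivated in PATCH 01's commit message would follow the same shape: compare `args["step"]` against a threshold and reconfigure the relevant components once it is crossed.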