explosion · adrianeboyd · Nov 23, 2022 · Nov 2, 2022 · Nov 2, 2022 · Nov 7, 2022
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
@@ -90,6 +90,8 @@ dev_corpus = "corpora.dev"
 train_corpus = "corpora.train"
 # Optional callback before nlp object is saved to disk after training
 before_to_disk = null
+# Optional callback that is invoked at the start of each training step
+before_update = null
 
 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"

diff --git a/spacy/schemas.py b/spacy/schemas.py
@@ -329,6 +329,7 @@ class ConfigSchemaTraining(BaseModel):
  frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
  annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
  before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
+ before_update: Optional[Callable[["Language", Dict[str, Any]], None]] = Field(..., title="Optional callback that is invoked at the start of each training step")
  # fmt: on
 
  class Config:

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
@@ -59,6 +59,7 @@ def train(
  batcher = T["batcher"]
  train_logger = T["logger"]
  before_to_disk = create_before_to_disk_callback(T["before_to_disk"])
+ before_update = T["before_update"]
 
  # Helper function to save checkpoints. This is a closure for convenience,
  # to avoid passing in all the args all the time.
@@ -89,6 +90,7 @@ def save_checkpoint(is_best):
  eval_frequency=T["eval_frequency"],
  exclude=frozen_components,
  annotating_components=annotating_components,
+ before_update=before_update,
  )
  clean_output_dir(output_path)
  stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
@@ -150,6 +152,7 @@ def train_while_improving(
  max_steps: int,
  exclude: List[str],
  annotating_components: List[str],
+ before_update: Optional[Callable[["Language", Dict[str, Any]], None]],
 ):
  """Train until an evaluation stops improving. Works as a generator,
  with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -198,6 +201,9 @@ def train_while_improving(
  words_seen = 0
  start_time = timer()
  for step, (epoch, batch) in enumerate(train_data):
+ if before_update:
+ before_update_args = {"current_step": step}
+ before_update(nlp, before_update_args)
  dropout = next(dropouts) # type: ignore
  for subbatch in subdivide_batch(batch, accumulate_gradient):
  nlp.update(

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
@@ -186,6 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
 | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~ |
 | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
+| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the entry `current_step` (the index of the current training step, counted from 0). Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
 | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
 | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
 | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |