Sync vocab in vectors and components sourced in configs (#9335)
Since a component may reference anything in the vocab, share the full
vocab when loading source components and vectors (which will include
`strings` as of #8909).

When loading a source component from a config, save and restore the
vocab state after loading source pipelines, in particular to preserve
the original state without vectors, since `[initialize.vectors] = null`
skips rather than resets the vectors.

The vocab references are not synced for components loaded with
`Language.add_pipe(source=)` because the pipelines are already loaded
and not necessarily with the same vocab. A warning could be added in
`Language.create_pipe_from_source` that it may be necessary to save and
reload before training, but it's a rare enough case that this kind of
warning may be too noisy overall.
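
As an illustration of that last point (not part of the commit message): after combining pipelines with `Language.add_pipe(source=)`, saving to disk and loading back rebuilds everything around a single vocab. A minimal sketch, assuming the `en_core_web_md` package is installed and using an illustrative output path:

import spacy

source_nlp = spacy.load("en_core_web_md")  # loaded with its own vocab
nlp = spacy.blank("en")

# Components added via add_pipe(source=...) keep references into the source
# pipeline's vocab; the two vocabs are not synced automatically.
nlp.add_pipe("ner", source=source_nlp)

# Saving and reloading gives every component one consistent vocab, which may
# be necessary before training (see the note above).
nlp.to_disk("combined_pipeline")
nlp = spacy.load("combined_pipeline")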
adrianeboyd authored Oct 4, 2021 · 1 parent 6e833b6 · commit 4192e71
Showing 2 changed files with 27 additions and 16 deletions.
spacy/language.py (21 additions, 8 deletions)
@@ -707,8 +707,9 @@ def create_pipe_from_source(
         source_config = source.config.interpolate()
         pipe_config = util.copy_config(source_config["components"][source_name])
         self._pipe_configs[name] = pipe_config
-        for s in source.vocab.strings:
-            self.vocab.strings.add(s)
+        if self.vocab.strings != source.vocab.strings:
+            for s in source.vocab.strings:
+                self.vocab.strings.add(s)
         return pipe, pipe_config["factory"]
 
     def add_pipe(
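
For context (not part of the diff): when the source pipeline is loaded with the current vocab, as the `from_config` changes below arrange, the two pipelines share a single `StringStore`, so the string-copying loop guarded above is skipped. A minimal sketch, with `en_core_web_sm` as an illustrative package name:

import spacy
from spacy.util import load_model

nlp = spacy.blank("en")
# Loading with vocab=nlp.vocab shares one Vocab (and one StringStore).
source = load_model("en_core_web_sm", vocab=nlp.vocab, exclude=["lookups"])
assert source.vocab.strings is nlp.vocab.strings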
@@ -1700,6 +1701,7 @@ def from_config(
         # them here so they're only loaded once
         source_nlps = {}
         source_nlp_vectors_hashes = {}
+        vocab_b = None
         for pipe_name in config["nlp"]["pipeline"]:
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
@@ -1722,14 +1724,22 @@
                         raw_config=raw_config,
                     )
                 else:
+                    # We need the sourced components to reference the same
+                    # vocab without modifying the current vocab state **AND**
+                    # we still want to load the source model vectors to perform
+                    # the vectors check. Since the source vectors clobber the
+                    # current ones, we save the original vocab state and
+                    # restore after this loop. Existing strings are preserved
+                    # during deserialization, so they do not need any
+                    # additional handling.
+                    if vocab_b is None:
+                        vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
                     model = pipe_cfg["source"]
                     if model not in source_nlps:
-                        # We only need the components here and we intentionally
-                        # do not load the model with the same vocab because
-                        # this would cause the vectors to be copied into the
-                        # current nlp object (all the strings will be added in
-                        # create_pipe_from_source)
-                        source_nlps[model] = util.load_model(model)
+                        # Load with the same vocab, adding any strings
+                        source_nlps[model] = util.load_model(
+                            model, vocab=nlp.vocab, exclude=["lookups"]
+                        )
                     source_name = pipe_cfg.get("component", pipe_name)
                     listeners_replaced = False
                     if "replace_listeners" in pipe_cfg:
@@ -1756,6 +1766,9 @@
                     # Delete from cache if listeners were replaced
                     if listeners_replaced:
                         del source_nlps[model]
+        # Restore the original vocab after sourcing if necessary
+        if vocab_b is not None:
+            nlp.vocab.from_bytes(vocab_b)
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
         nlp.batch_size = config["nlp"]["batch_size"]
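
Outside the diff, the snapshot/restore pattern used in `from_config` above can be sketched in isolation. A minimal sketch, assuming a blank English pipeline; the sourcing steps in between are elided:

import spacy

nlp = spacy.blank("en")

# Snapshot the vocab state, leaving out lookups and strings (strings added
# while sourcing should survive the restore).
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])

# ... loading source pipelines with vocab=nlp.vocab happens here and may
# clobber nlp.vocab.vectors with the source vectors ...

# Restore the original, vector-less vocab state once sourcing is done.
nlp.vocab.from_bytes(vocab_b)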
spacy/training/initialize.py (6 additions, 8 deletions)
@@ -144,7 +144,12 @@ def load_vectors_into_model(
 ) -> None:
     """Load word vectors from an installed model or path into a model instance."""
     try:
-        vectors_nlp = load_model(name)
+        # Load with the same vocab, which automatically adds the vectors to
+        # the current nlp object. Exclude lookups so they are not modified.
+        exclude = ["lookups"]
+        if not add_strings:
+            exclude.append("strings")
+        vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
     except ConfigValidationError as e:
         title = f"Config validation error for vectors {name}"
         desc = (
@@ -158,15 +163,8 @@
     if len(vectors_nlp.vocab.vectors.keys()) == 0:
         logger.warning(Warnings.W112.format(name=name))
 
-    nlp.vocab.vectors = vectors_nlp.vocab.vectors
-    for lex in nlp.vocab:
-        lex.rank = nlp.vocab.vectors.key2row.get(lex.orth, OOV_RANK)
-    if add_strings:
-        # I guess we should add the strings from the vectors_nlp model?
-        # E.g. if someone does a similarity query, they might expect the strings.
-        for key in nlp.vocab.vectors.key2row:
-            if key in vectors_nlp.vocab.strings:
-                nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
 
 
 def init_tok2vec(
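
A usage sketch of the function changed above (not part of the commit), assuming a vectors-bearing package such as `en_core_web_md` is installed; the package name is illustrative:

import spacy
from spacy.training.initialize import load_vectors_into_model

nlp = spacy.blank("en")
# The package is loaded with vocab=nlp.vocab, so its vectors (and, unless
# add_strings is disabled, their strings) land directly on nlp.vocab, while
# nlp.vocab.lookups is left untouched.
load_vectors_into_model(nlp, "en_core_web_md")
print(nlp.vocab.vectors.shape)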
