Add TextCatReduce.v1 (explosion#13181)

* Add TextCatReduce.v1 This is a textcat classifier that pools the vectors generated by a tok2vec implementation and then applies a classifier to the pooled representation. Three reductions are supported for pooling: first, max, and mean. When multiple reductions are enabled, the reductions are concatenated before providing them to the classification layer. This model is a generalization of the TextCatCNN model, which only supports mean reductions and is a bit of a misnomer, because it can also be used with transformers. This change also reimplements TextCatCNN.v2 using the new TextCatReduce.v1 layer. * Doc fixes Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fully specify `TextCatCNN` <-> `TextCatReduce` equivalence * Move TextCatCNN docs to legacy, in prep for moving to spacy-legacy * Add back a test for TextCatCNN.v2 * Replace TextCatCNN in pipe configurations and templates * Add an infobox to the `TextCatReduce` section with a `TextCatCNN` anchor * Add last reduction (`use_reduce_last`) * Remove non-working TextCatCNN Netlify redirect * Revert layer changes for the quickstart * Revert one more quickstart change * Remove unused import * Fix docstring * Fix setting name in error message --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
jordankanter · Mar 14, 2024 · 845ce57 · 845ce57
1 parent de480b6
commit 845ce57
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 158 deletions.
diff --git a/spacy/errors.py b/spacy/errors.py
@@ -974,6 +974,9 @@ class Errors(metaclass=ErrorsWithCodes):
  E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
  "but only callbacks with one or three parameters are supported")
  E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+ E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+ "reduction. Please enable one of `use_reduce_first`, "
+ "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
  # v4 error strings
  E4000 = ("Expected a Doc as input, but got: '{type}'")

diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
@@ -22,6 +22,9 @@
  reduce_first,
  reduce_last,
  reduce_max,
+ reduce_first,
+ reduce_last,
+ reduce_max,
  reduce_mean,
  reduce_sum,
  residual,
@@ -63,6 +66,15 @@ def build_simple_cnn_text_classifier(
  use_reduce_mean=True,
  nO=nO,
  )
+ return build_reduce_text_classifier(
+ tok2vec=tok2vec,
+ exclusive_classes=exclusive_classes,
+ use_reduce_first=False,
+ use_reduce_last=False,
+ use_reduce_max=False,
+ use_reduce_mean=True,
+ nO=nO,
+ )
 
 
 def resize_and_set_ref(model, new_nO, resizable_layer):
@@ -221,79 +233,6 @@ def build_text_classifier_lowdata(
  return model
 
 
-@registry.architectures("spacy.TextCatParametricAttention.v1")
-def build_textcat_parametric_attention_v1(
- tok2vec: Model[List[Doc], List[Floats2d]],
- exclusive_classes: bool,
- nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
- width = tok2vec.maybe_get_dim("nO")
- parametric_attention = _build_parametric_attention_with_residual_nonlinear(
- tok2vec=tok2vec,
- nonlinear_layer=Maxout(nI=width, nO=width),
- key_transform=Gelu(nI=width, nO=width),
- )
- with Model.define_operators({">>": chain}):
- if exclusive_classes:
- output_layer = Softmax(nO=nO)
- else:
- output_layer = Linear(nO=nO) >> Logistic()
- model = parametric_attention >> output_layer
- if model.has_dim("nO") is not False and nO is not None:
- model.set_dim("nO", cast(int, nO))
- model.set_ref("output_layer", output_layer)
- model.attrs["multi_label"] = not exclusive_classes
-
- return model
-
-
-def _build_parametric_attention_with_residual_nonlinear(
- *,
- tok2vec: Model[List[Doc], List[Floats2d]],
- nonlinear_layer: Model[Floats2d, Floats2d],
- key_transform: Optional[Model[Floats2d, Floats2d]] = None,
-) -> Model[List[Doc], Floats2d]:
- with Model.define_operators({">>": chain, "|": concatenate}):
- width = tok2vec.maybe_get_dim("nO")
- attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
- norm_layer = LayerNorm(nI=width)
- parametric_attention = (
- tok2vec
- >> list2ragged()
- >> attention_layer
- >> reduce_sum()
- >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
- )
-
- parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
-
- parametric_attention.set_ref("tok2vec", tok2vec)
- parametric_attention.set_ref("attention_layer", attention_layer)
- parametric_attention.set_ref("key_transform", key_transform)
- parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
- parametric_attention.set_ref("norm_layer", norm_layer)
-
- return parametric_attention
-
-
-def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
- # When tok2vec is lazily initialized, we need to initialize it before
- # the rest of the chain to ensure that we can get its width.
- tok2vec = model.get_ref("tok2vec")
- tok2vec.initialize(X)
-
- tok2vec_width = get_tok2vec_width(model)
- model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
- model.get_ref("key_transform").set_dim("nI", tok2vec_width)
- model.get_ref("key_transform").set_dim("nO", tok2vec_width)
- model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
- model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
- model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
- model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
- init_chain(model, X, Y)
- return model
-
-
 @registry.architectures("spacy.TextCatReduce.v1")
 def build_reduce_text_classifier(
  tok2vec: Model,

diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
@@ -473,6 +473,8 @@ def test_no_resize(name, textcat_config):
  # REDUCE
  ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
  ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+ ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+ ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
  ],
 )
 # fmt: on
@@ -499,9 +501,9 @@ def test_resize(name, textcat_config):
  ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
  ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
  ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
- # CNN
- ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
- ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+ # REDUCE
+ ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+ ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
  ],
 )
 # fmt: on
@@ -749,12 +751,9 @@ def test_overfitting_IO_multi():
  # ENSEMBLE V2
  ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
  ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
- # CNN V2
+ # CNN V2 (legacy)
  ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
  ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
- # PARAMETRIC ATTENTION V1
- ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
- ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
  # REDUCE V1
  ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
  ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),

diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
@@ -1020,46 +1020,6 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatCNN.v2"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name | Description |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
-| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
-
-<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-</Accordion>
-
-### spacy.TextCatBOW.v3 {id="TextCatBOW"}
-
 > #### Example Config
 >
 > ```ini
@@ -1096,44 +1056,6 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
-### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatParametricAttention.v1"
-> exclusive_classes = true
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.Tok2Vec.v2"
->
-> [model.tok2vec.embed]
-> @architectures = "spacy.MultiHashEmbed.v2"
-> width = 64
-> rows = [2000, 2000, 1000, 1000, 1000, 1000]
-> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
-> include_static_vectors = false
->
-> [model.tok2vec.encode]
-> @architectures = "spacy.MaxoutWindowEncoder.v2"
-> width = ${model.tok2vec.embed.width}
-> window_size = 1
-> maxout_pieces = 3
-> depth = 2
-> ```
-
-A neural network model that is built upon Tok2Vec and uses parametric attention
-to attend to tokens that are relevant to text classification.
-
-| Name | Description |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec` | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
-| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
-
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config