From 755e4d729ae9c1fa364165547e883f4a7c88fb4f Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 18:37:02 -0700 Subject: [PATCH 01/22] Trainer Schema Changes --- ludwig/schema/metadata/configs/trainer.yaml | 36 ++++++++++----------- ludwig/schema/trainer.py | 28 ++++++++-------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/ludwig/schema/metadata/configs/trainer.yaml b/ludwig/schema/metadata/configs/trainer.yaml index 7769fc21342..064ecad959c 100644 --- a/ludwig/schema/metadata/configs/trainer.yaml +++ b/ludwig/schema/metadata/configs/trainer.yaml @@ -48,7 +48,7 @@ checkpoints_per_epoch: It is also more engaging and more valuable to ensure a frequent pulse of evaluation metrics, even if they are partial." - expected_impact: 3 + expected_impact: 2 related_parameters: - train_steps - steps_per_checkpoint @@ -73,7 +73,7 @@ early_stop: run is quit. This can be efficient for pruning bad models earlier, but since the training process is inherently non-deterministic and noisy, sometimes improvements happen very gradually over a long period of time. - expected_impact: 2 + expected_impact: 3 related_parameters: - epochs - train_steps @@ -114,7 +114,7 @@ eval_batch_size: maxing out memory limits will speed up the model training process overall. example_value: - 512 - expected_impact: 2 + expected_impact: 1 other_information: Should only set the batch_size to a level that you can fit in memory @@ -139,7 +139,7 @@ evaluate_training_set: training set is large, can be a huge computational cost. Turning off training set evaluation will lead to significant gains in training throughput and efficiency. For small datasets that train and evaluate quickly, the choice is trivial. - expected_impact: 3 + expected_impact: 1 suggested_values: false suggested_values_reasoning: Running full-scale evaluation on the full training @@ -157,7 +157,7 @@ gradient_clipping: gradients in very deep networks. Increasing gradient clipping can help with model training loss curve stability, but it can also make training less efficient as weight at each training step is capped. - expected_impact: 2 + expected_impact: 1 suggested_values_reasoning: It's usually sensible to have some conservative notion of gradient clipping to make modeling robust to a particularly bad or noisy @@ -209,7 +209,7 @@ learning_rate_scaling: can sometimes lead to better model performance. If the learning rate is hand-tuned for a given number of workers, setting this value to constant can be used to disable scale-up. - expected_impact: 2 + expected_impact: 1 suggested_values: linear or sqrt suggested_values_reasoning: Traditionally the learning rate is scaled linearly @@ -226,7 +226,7 @@ max_batch_size: by auto batch size tuning and batch size increasing on plateau. example_value: - 1024 - expected_impact: 2 + expected_impact: 1 related_parameters: - batch_size - increase_batch_size_on_plateau @@ -276,7 +276,7 @@ regularization_lambda: is data-dependent, so you'll need to do some tuning. We recommend trying a handful of values (0.01, 0.02, ... 
0.4) gradually increasing the value until training curves get worse" - expected_impact: 3 + expected_impact: 2 literature_references: - "https://developers.google.com/machine-learning/crash-course/regularization-for-simplicity/lambda " related_parameters: @@ -297,7 +297,7 @@ regularization_type: \ selection, since weights are only reduced to values near 0 instead of 0.\ \ L1 regularization has built-in feature selection.\nL1 regularization is\ \ robust to outliers, L2 regularization is not." - expected_impact: 3 + expected_impact: 2 literature_references: - "https://neptune.ai/blog/fighting-overfitting-with-l1-or-l2-regularization#:~:text=The%20differences%20between%20L1%20and,regularization%20solution%20is%20non%2Dsparse. " related_parameters: @@ -311,7 +311,7 @@ should_shuffle: description_implications: Turning off mini-batch shuffling can make training faster, but it may lead to worse performance overall as shuffling helps mitigate overfitting. - expected_impact: 2 + expected_impact: 1 literature_references: - "https://stats.stackexchange.com/questions/245502/why-should-we-shuffle-data-while-training-a-neural-network#:~:text=it%20helps%20the%20training%20converge,the%20order%20of%20the%20training " suggested_values: true @@ -347,7 +347,7 @@ steps_per_checkpoint: It is also more engaging and more valuable to ensure a frequent pulse of evaluation metrics, even if they are partial." - expected_impact: 3 + expected_impact: 1 related_parameters: - checkpoints_per_epoch suggested_values: O(1k) for larger datasets @@ -365,7 +365,7 @@ train_steps: description_implications: Decreasing this will shorten the overall runway for training the model. - expected_impact: 3 + expected_impact: 1 related_parameters: - epochs suggested_values: 0 (and use epochs), or 1000000, 1 for debugging @@ -399,7 +399,7 @@ validation_field: This parameter affects 1) what the early stopping policy looks at to determine when to early stop and 2) hyperparameter optimization for determining the best trial. - expected_impact: 3 + expected_impact: 1 related_parameters: - validation_field - validation_metric @@ -410,7 +410,7 @@ validation_metric: This parameter affects 1) what the early stopping policy looks at to determine when to early stop and 2) hyperparameter optimization for determining the best trial. - expected_impact: 3 + expected_impact: 1 related_parameters: - validation_field - validation_metric @@ -474,7 +474,7 @@ learning_rate_scheduler: \ As a rule of thumb, compared to training without a schedule, you can use\ \ a slightly higher maximum learning rate. Since the learning rate changes\ \ over time, the whole training is not so sensitive to the value picked." - expected_impact: 2 + expected_impact: 3 literature_references: - "https://peltarion.com/knowledge-center/documentation/modeling-view/run-a-model/optimization-principles-(in-deep-learning)/learning-rate-schedule " related_parameters: @@ -498,7 +498,7 @@ learning_rate_scheduler: faster. This could make the model more robust to a bad (too high) initial learning rate, but a decay rate that is too high could prohibit the model from learning anything at all. - expected_impact: 1 + expected_impact: 2 literature_references: - "https://peltarion.com/knowledge-center/documentation/modeling-view/run-a-model/optimization-principles-(in-deep-learning)/learning-rate-schedule " related_parameters: @@ -519,7 +519,7 @@ learning_rate_scheduler: learning rate decays. 
example_value: - 5000 - expected_impact: 1 + expected_impact: 2 related_parameters: - decay_rate - decay_steps @@ -552,7 +552,7 @@ learning_rate_scheduler: decaying the learning rate is superior to doing so continuously. ui_display_name: Staircase reduce_on_plateau: - expected_impact: 1 + expected_impact: 2 ui_display_name: Reduce On Plateau reduce_on_plateau_patience: expected_impact: 1 diff --git a/ludwig/schema/trainer.py b/ludwig/schema/trainer.py index 8c6945aa353..7abc824ea3f 100644 --- a/ludwig/schema/trainer.py +++ b/ludwig/schema/trainer.py @@ -64,6 +64,20 @@ class ECDTrainerConfig(BaseTrainerConfig): parameter_metadata=TRAINER_METADATA["epochs"], ) + batch_size: Union[int, str] = schema_utils.OneOfOptionsField( + default=DEFAULT_BATCH_SIZE, + allow_none=False, + description=( + "The number of training examples utilized in one training step of the model. If ’auto’, the " + "biggest batch size (power of 2) that can fit in memory will be used." + ), + parameter_metadata=TRAINER_METADATA["batch_size"], + field_options=[ + schema_utils.PositiveInteger(default=128, description="", allow_none=False), + schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False), + ], + ) + checkpoints_per_epoch: int = schema_utils.NonNegativeInteger( default=0, description=( @@ -101,20 +115,6 @@ class ECDTrainerConfig(BaseTrainerConfig): parameter_metadata=TRAINER_METADATA["early_stop"], ) - batch_size: Union[int, str] = schema_utils.OneOfOptionsField( - default=DEFAULT_BATCH_SIZE, - allow_none=False, - description=( - "The number of training examples utilized in one training step of the model. If ’auto’, the " - "biggest batch size (power of 2) that can fit in memory will be used." - ), - parameter_metadata=TRAINER_METADATA["batch_size"], - field_options=[ - schema_utils.PositiveInteger(default=128, description="", allow_none=False), - schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False), - ], - ) - max_batch_size: int = schema_utils.PositiveInteger( default=MAX_POSSIBLE_BATCH_SIZE, allow_none=True, From 6abed485968d27135044dde15dbe5cd6dbd1bb0c Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 18:55:12 -0700 Subject: [PATCH 02/22] Add optimizer EI --- ludwig/schema/optimizers.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/ludwig/schema/optimizers.py b/ludwig/schema/optimizers.py index ebff605dbf5..29d6ac8ac52 100644 --- a/ludwig/schema/optimizers.py +++ b/ludwig/schema/optimizers.py @@ -421,11 +421,26 @@ def _jsonschema_type_mapping(): class GradientClippingConfig(schema_utils.BaseMarshmallowConfig): """Dataclass that holds gradient clipping parameters.""" - clipglobalnorm: Optional[float] = schema_utils.FloatRange(default=0.5, allow_none=True, description="") + clipglobalnorm: Optional[float] = schema_utils.FloatRange( + default=0.5, + allow_none=True, + description="", + parameter_metadata=TRAINER_METADATA["gradient_clipping"] + ) - clipnorm: Optional[float] = schema_utils.FloatRange(default=None, allow_none=True, description="") + clipnorm: Optional[float] = schema_utils.FloatRange( + default=None, + allow_none=True, + description="", + parameter_metadata=TRAINER_METADATA["gradient_clipping"] + ) - clipvalue: Optional[float] = schema_utils.FloatRange(default=None, allow_none=True, description="") + clipvalue: Optional[float] = schema_utils.FloatRange( + default=None, + allow_none=True, + description="", + parameter_metadata=TRAINER_METADATA["gradient_clipping"] + ) @DeveloperAPI From 
5ca3330f6ebee8cd5e78f2aaa26f7f526f9017fd Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 19:47:53 -0700 Subject: [PATCH 03/22] Add optimizer metadata' --- ludwig/schema/metadata/__init__.py | 1 + .../schema/metadata/configs/optimizers.yaml | 59 +++++ ludwig/schema/metadata/configs/trainer.yaml | 15 -- ludwig/schema/optimizers.py | 213 ++++++++++++++---- 4 files changed, 223 insertions(+), 65 deletions(-) create mode 100644 ludwig/schema/metadata/configs/optimizers.yaml diff --git a/ludwig/schema/metadata/__init__.py b/ludwig/schema/metadata/__init__.py index 6c93a1f0719..fbeb96ed964 100644 --- a/ludwig/schema/metadata/__init__.py +++ b/ludwig/schema/metadata/__init__.py @@ -33,3 +33,4 @@ def _load(fname: str) -> Dict[str, Any]: FEATURE_METADATA = _load("features.yaml") PREPROCESSING_METADATA = _load("preprocessing.yaml") TRAINER_METADATA = _load("trainer.yaml") +OPTIMIZER_METADATA = _load("optimizers.yaml") diff --git a/ludwig/schema/metadata/configs/optimizers.yaml b/ludwig/schema/metadata/configs/optimizers.yaml new file mode 100644 index 00000000000..f28b246bb8d --- /dev/null +++ b/ludwig/schema/metadata/configs/optimizers.yaml @@ -0,0 +1,59 @@ +gradient_clipping: + default_value_reasoning: + A conservative cap on the maximum gradient size to apply + over a single training step. + description_implications: + Gradient clipping is a technique to prevent exploding + gradients in very deep networks. Increasing gradient clipping can help with + model training loss curve stability, but it can also make training less efficient + as weight at each training step is capped. + expected_impact: 1 + suggested_values_reasoning: + It's usually sensible to have some conservative notion + of gradient clipping to make modeling robust to a particularly bad or noisy + batch of examples. + ui_display_name: Gradient Clipping +momentum: + expected_impact: 1 +weight_decay: + expected_impact: 1 +dampening: + expected_impact: 1 +nesterov: + expected_impact: 1 +max_iter: + expected_impact: 1 +max_eval: + expected_impact: 1 +tolerance_grad: + expected_impact: 1 +tolerance_change: + expected_impact: 1 +history_size: + expected_impact: 1 +line_search_fn: + expected_impact: 1 +betas: + expected_impact: 1 +amsgrad: + expected_impact: 1 +rho: + expected_impact: 1 +initial_accumulator_value: + expected_impact: 1 +lr_decay: + expected_impact: 1 +learning_rate_power: + expected_impact: 1 +l1_regularization_strength: + expected_impact: 1 +l2_regularization_strength: + expected_impact: 1 +momentum_decay: + expected_impact: 1 +alpha: + expected_impact: 1 +eps: + expected_impact: 1 +centered: + expected_impact: 1 \ No newline at end of file diff --git a/ludwig/schema/metadata/configs/trainer.yaml b/ludwig/schema/metadata/configs/trainer.yaml index 064ecad959c..4f87658c399 100644 --- a/ludwig/schema/metadata/configs/trainer.yaml +++ b/ludwig/schema/metadata/configs/trainer.yaml @@ -148,21 +148,6 @@ evaluate_training_set: so it will still be easy to spot signs of overfitting like when the training-validation loss curves diverge. ui_display_name: Evaluate Training Set -gradient_clipping: - default_value_reasoning: - A conservative cap on the maximum gradient size to apply - over a single training step. - description_implications: - Gradient clipping is a technique to prevent exploding - gradients in very deep networks. Increasing gradient clipping can help with - model training loss curve stability, but it can also make training less efficient - as weight at each training step is capped. 
- expected_impact: 1 - suggested_values_reasoning: - It's usually sensible to have some conservative notion - of gradient clipping to make modeling robust to a particularly bad or noisy - batch of examples. - ui_display_name: Gradient Clipping increase_batch_size_eval_metric: expected_impact: 1 ui_display_name: "Batch Size Increase: Evaluation Metric" diff --git a/ludwig/schema/optimizers.py b/ludwig/schema/optimizers.py index 29d6ac8ac52..25b98f3ef23 100644 --- a/ludwig/schema/optimizers.py +++ b/ludwig/schema/optimizers.py @@ -8,7 +8,7 @@ import ludwig.schema.utils as schema_utils from ludwig.api_annotations import DeveloperAPI -from ludwig.schema.metadata import TRAINER_METADATA +from ludwig.schema.metadata import OPTIMIZER_METADATA from ludwig.schema.metadata.parameter_metadata import convert_metadata_to_json from ludwig.utils.registry import Registry @@ -65,10 +65,26 @@ class SGDOptimizerConfig(BaseOptimizerConfig): 'sgd')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD : - momentum: float = schema_utils.NonNegativeFloat(default=0.0, description="Momentum factor.") - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") - dampening: float = schema_utils.NonNegativeFloat(default=0.0, description="Dampening for momentum.") - nesterov: bool = schema_utils.Boolean(default=False, description="Enables Nesterov momentum.") + momentum: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Momentum factor.", + parameter_metadata=OPTIMIZER_METADATA["momentum"] + ) + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) + dampening: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Dampening for momentum.", + parameter_metadata=OPTIMIZER_METADATA["dampening"] + ) + nesterov: bool = schema_utils.Boolean( + default=False, + description="Enables Nesterov momentum.", + parameter_metadata=OPTIMIZER_METADATA["nesterov"] + ) @DeveloperAPI @@ -85,23 +101,42 @@ class LBFGSOptimizerConfig(BaseOptimizerConfig): 'lbfgs')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.LBFGS.html#torch.optim.LBFGS - max_iter: int = schema_utils.Integer(default=20, description="Maximum number of iterations per optimization step.") + max_iter: int = schema_utils.Integer( + default=20, + description="Maximum number of iterations per optimization step.", + parameter_metadata=OPTIMIZER_METADATA["max_iter"] + ) + max_eval: int = schema_utils.Integer( default=None, allow_none=True, description="Maximum number of function evaluations per optimization step. Default: `max_iter` * 1.25.", + parameter_metadata=OPTIMIZER_METADATA["max_eval"] ) + tolerance_grad: float = schema_utils.NonNegativeFloat( - default=1e-07, description="Termination tolerance on first order optimality." + default=1e-07, + description="Termination tolerance on first order optimality.", + parameter_metadata=OPTIMIZER_METADATA["tolerance_grad"] ) + tolerance_change: float = schema_utils.NonNegativeFloat( - default=1e-09, description="Termination tolerance on function value/parameter changes." 
+ default=1e-09, + description="Termination tolerance on function value/parameter changes.", + parameter_metadata=OPTIMIZER_METADATA["tolerance_change"] + ) + + history_size: int = schema_utils.Integer( + default=100, + description="Update history size.", + parameter_metadata=OPTIMIZER_METADATA["history_size"] ) - history_size: int = schema_utils.Integer(default=100, description="Update history size.") + line_search_fn: str = schema_utils.StringOptions( ["strong_wolfe"], default=None, description="Line search function to use.", + parameter_metadata=OPTIMIZER_METADATA["line_search_fn"] ) @@ -120,21 +155,28 @@ class AdamOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." + default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay (L2 penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay (L2 penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) amsgrad: bool = schema_utils.Boolean( default=False, - description=( - "Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam and" - "Beyond'." - ), + description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam " + "and Beyond'.", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"] ) @@ -153,21 +195,28 @@ class AdamWOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." + default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) amsgrad: bool = schema_utils.Boolean( default=False, - description=( - "Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam and " - "Beyond'." - ), + description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam " + "and Beyond'. 
", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"] ) @@ -190,13 +239,20 @@ class AdadeltaOptimizerConfig(BaseOptimizerConfig): min=0, max=1, description="Coefficient used for computing a running average of squared gradients.", + parameter_metadata=OPTIMIZER_METADATA["rho"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-06, description="Term added to the denominator to improve numerical stability." + default=1e-06, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) @DeveloperAPI @@ -214,14 +270,28 @@ class AdagradOptimizerConfig(BaseOptimizerConfig): (default: 'adagrad')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adagrad.html#torch.optim.Adagrad : - initial_accumulator_value: float = schema_utils.NonNegativeFloat(default=0, description="") + initial_accumulator_value: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + ) - lr_decay: float = schema_utils.FloatRange(default=0, description="Learning rate decay.") + lr_decay: float = schema_utils.FloatRange( + default=0, + description="Learning rate decay.", + parameter_metadata=OPTIMIZER_METADATA["lr_decay"] + ) - weight_decay: float = schema_utils.FloatRange(default=0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.FloatRange( + default=0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) eps: float = schema_utils.FloatRange( - default=1e-10, description="Term added to the denominator to improve numerical stability." + default=1e-10, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) @@ -240,14 +310,22 @@ class AdamaxOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adamax.html#torch.optim.Adamax : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." 
+ default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) # NOTE: keep ftrl and nadam optimizers out of registry: @@ -258,13 +336,26 @@ class FtrlOptimizerConfig(BaseOptimizerConfig): # optimizer_class: ClassVar[torch.optim.Optimizer] = torch.optim.Ftrl type: str = schema_utils.ProtectedString("ftrl") - learning_rate_power: float = schema_utils.FloatRange(default=-0.5, max=0.0) + learning_rate_power: float = schema_utils.FloatRange( + default=-0.5, + max=0, + parameter_metadata=OPTIMIZER_METADATA["learning_rate_power"] + ) - initial_accumulator_value: float = schema_utils.NonNegativeFloat(default=0.1) + initial_accumulator_value: float = schema_utils.NonNegativeFloat( + default=0.1, + parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + ) - l1_regularization_strength: float = schema_utils.NonNegativeFloat(default=0.0) + l1_regularization_strength: float = schema_utils.NonNegativeFloat( + default=0.0, + parameter_metadata=OPTIMIZER_METADATA["l1_regularization_strength"] + ) - l2_regularization_strength: float = schema_utils.NonNegativeFloat(default=0.0) + l2_regularization_strength: float = schema_utils.NonNegativeFloat( + default=0.0, + parameter_metadata=OPTIMIZER_METADATA["l2_regularization_strength"] + ) @DeveloperAPI @@ -279,16 +370,28 @@ class NadamOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.NAdam.html#torch.optim.NAdam : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." 
+ default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"], ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) - momentum_decay: float = schema_utils.NonNegativeFloat(default=4e-3, description="Momentum decay.") + momentum_decay: float = schema_utils.NonNegativeFloat( + default=4e-3, + description="Momentum decay.", + parameter_metadata=OPTIMIZER_METADATA["momentum_decay"] + ) @DeveloperAPI @@ -305,19 +408,29 @@ class RMSPropOptimizerConfig(BaseOptimizerConfig): (default: 'rmsprop')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html#torch.optim.RMSprop: - momentum: float = schema_utils.NonNegativeFloat(default=0.0, description="Momentum factor.") + momentum: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Momentum factor.", + parameter_metadata=OPTIMIZER_METADATA["momentum"], + ) - alpha: float = schema_utils.NonNegativeFloat(default=0.99, description="Smoothing constant.") + alpha: float = schema_utils.NonNegativeFloat( + default=0.99, + description="Smoothing constant.", + parameter_metadata=OPTIMIZER_METADATA["alpha"], + ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." + default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"], ) centered: bool = schema_utils.Boolean( default=False, - description=( - "If True, computes the centered RMSProp, and the gradient is normalized by an estimation of its variance." 
- ), + description="If True, computes the centered RMSProp, and the gradient is normalized by an estimation of its " + "variance.", + parameter_metadata=OPTIMIZER_METADATA["centered"], ) weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") @@ -425,21 +538,21 @@ class GradientClippingConfig(schema_utils.BaseMarshmallowConfig): default=0.5, allow_none=True, description="", - parameter_metadata=TRAINER_METADATA["gradient_clipping"] + parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipnorm: Optional[float] = schema_utils.FloatRange( default=None, allow_none=True, description="", - parameter_metadata=TRAINER_METADATA["gradient_clipping"] + parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipvalue: Optional[float] = schema_utils.FloatRange( default=None, allow_none=True, description="", - parameter_metadata=TRAINER_METADATA["gradient_clipping"] + parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) @@ -500,7 +613,7 @@ def _jsonschema_type_mapping(): dump_default=dump_default, metadata={ "description": description, - "parameter_metadata": convert_metadata_to_json(TRAINER_METADATA["gradient_clipping"]), + "parameter_metadata": convert_metadata_to_json(OPTIMIZER_METADATA["gradient_clipping"]), }, ) }, From 828b8766ef023907593c5015d97ee07e3b910ab7 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 20:55:08 -0700 Subject: [PATCH 04/22] Feature preprocessing done --- ludwig/schema/metadata/configs/features.yaml | 91 +++++++++++++++----- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/ludwig/schema/metadata/configs/features.yaml b/ludwig/schema/metadata/configs/features.yaml index c1a0289f997..db686e09ce4 100644 --- a/ludwig/schema/metadata/configs/features.yaml +++ b/ludwig/schema/metadata/configs/features.yaml @@ -2,14 +2,16 @@ audio: preprocessing: audio_file_length_limit_in_s: ui_display_name: null + expected_impact: 2 computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 ui_display_name: Fill Value + expected_impact: 1 in_memory: ui_display_name: null + expected_impact: 1 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -37,7 +39,7 @@ audio: learning rate. 
example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -53,6 +55,7 @@ audio: ui_display_name: Normalization Type num_fft_points: ui_display_name: null + expected_impact: 1 num_filter_bands: literature_references: - "https://medium.com/analytics-vidhya/simplifying-audio-data-fft-stft-mfcc-for-machine-learning-and-deep-learning-443a2f962e0e " @@ -61,8 +64,10 @@ audio: - type - window_shift_in_s ui_display_name: Type + expected_impact: 1 padding_value: ui_display_name: null + expected_impact: 1 type: default_value_reasoning: The default type fbank is set based on values @@ -98,6 +103,7 @@ audio: - type - num_filter_bands ui_display_name: Window Length in Seconds + expected_impact: 2 window_shift_in_s: literature_references: - "https://medium.com/analytics-vidhya/simplifying-audio-data-fft-stft-mfcc-for-machine-learning-and-deep-learning-443a2f962e0e " @@ -106,18 +112,21 @@ audio: - type - num_filter_bands ui_display_name: Window Shift in Seconds + expected_impact: 2 window_type: ui_display_name: null + expected_impact: 2 bag: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 ui_display_name: Fill Value + expected_impact: 1 lowercase: ui_display_name: null + expected_impact: 2 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -145,7 +154,7 @@ bag: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -160,6 +169,7 @@ bag: ui_display_name: Most common (vocabulary size) tokenizer: ui_display_name: null + expected_impact: 3 binary: preprocessing: computed_fill_value: @@ -174,7 +184,7 @@ binary: expected_impact: 2 ui_display_name: Fallback True Label fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -190,16 +200,18 @@ binary: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 category: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: ui_display_name: null + expected_impact: 2 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -214,6 +226,7 @@ category: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 most_common: default_value_reasoning: If there are more than 10000 unique categories @@ -226,7 +239,7 @@ category: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -258,11 +271,11 @@ date: serves as a truncator. example_value: - "%d %b %Y" - expected_impact: 1 + expected_impact: 2 suggested_values_reasoning: Have Ludwig figure out the date format automatically. 
ui_display_name: Datetime format fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -278,13 +291,14 @@ date: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 h3: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -300,28 +314,36 @@ h3: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 image: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value height: ui_display_name: null + expected_impact: 2 in_memory: ui_display_name: null + expected_impact: 1 infer_image_dimensions: ui_display_name: null + expected_impact: 1 infer_image_max_height: ui_display_name: null + expected_impact: 1 infer_image_max_width: ui_display_name: null + expected_impact: 1 infer_image_num_channels: ui_display_name: null + expected_impact: 1 infer_image_sample_size: ui_display_name: null + expected_impact: 1 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -336,10 +358,13 @@ image: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 num_channels: ui_display_name: null + expected_impact: 2 num_processes: ui_display_name: null + expected_impact: 2 resize_method: default_value_reasoning: Interpolation may stretch or squish the image, @@ -358,15 +383,17 @@ image: ui_display_name: Resize Method standardize_image: ui_display_name: null + expected_impact: 1 width: ui_display_name: null + expected_impact: 2 number: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -382,6 +409,7 @@ number: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 normalization: default_value_reasoning: It could be valuable to observe how the model @@ -412,10 +440,11 @@ sequence: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: The default value is 256. 
Every sequence will @@ -448,6 +477,7 @@ sequence: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 most_common: default_value_reasoning: If there are more than 10000 unique categories @@ -460,7 +490,7 @@ sequence: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -478,14 +508,19 @@ sequence: example_value: - 3 ui_display_name: n-gram size + expected_impact: 2 padding: ui_display_name: null + expected_impact: 1 padding_symbol: ui_display_name: null + expected_impact: 1 tokenizer: ui_display_name: null + expected_impact: 3 unknown_symbol: ui_display_name: null + expected_impact: 1 vocab_file: default_value_reasoning: The vocabulary can be parsed automatically from @@ -496,7 +531,7 @@ sequence: that fits your data, or if there are several uncommon or infrequently occurring tokens that we want to guarantee to be a part of the vocabulary, rather than treated as an unknown. - expected_impact: 2 + expected_impact: 0 ui_display_name: Vocab File set: preprocessing: @@ -504,10 +539,11 @@ set: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: ui_display_name: null + expected_impact: 2 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -559,7 +595,7 @@ text: - missing_value_strategy, fill_value ui_display_name: DOCSTRING ONLY fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: default_value_reasoning: @@ -571,7 +607,7 @@ text: words are seen as completely separate entities than lowercase words. example_value: - true - expected_impact: 1 + expected_impact: 2 related_parameters: - vocab_size suggested_values: "TRUE" @@ -612,6 +648,7 @@ text: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 most_common: default_value_reasoning: If there are more than 10000 unique categories @@ -624,7 +661,7 @@ text: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -642,6 +679,7 @@ text: example_value: - 3 ui_display_name: n-gram size + expected_impact: 2 padding: default_value_reasoning: We usually want to add padding to the end of @@ -663,8 +701,10 @@ text: ui_display_name: Padding padding_symbol: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null + expected_impact: 0 tokenizer: default_value_reasoning: 'The default tokenizer is `space_punct`, an abbreviation @@ -699,6 +739,7 @@ text: ui_display_name: Tokenizer unknown_symbol: ui_display_name: null + expected_impact: 1 vocab_file: default_value_reasoning: The vocabulary can be parsed automatically from @@ -709,7 +750,7 @@ text: that fits your data, or if there are several uncommon or infrequently occurring tokens that we want to guarantee to be a part of the vocabulary, rather than treated as an unknown. 
- expected_impact: 2 + expected_impact: 0 ui_display_name: Vocab File timeseries: preprocessing: @@ -717,7 +758,7 @@ timeseries: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -733,21 +774,26 @@ timeseries: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 padding: ui_display_name: null + expected_impact: 1 padding_value: ui_display_name: null + expected_impact: 1 timeseries_length_limit: ui_display_name: null + expected_impact: 2 tokenizer: ui_display_name: null + expected_impact: 3 vector: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -766,3 +812,4 @@ vector: ui_display_name: Missing Value Strategy vector_size: ui_display_name: null + expected_impact: 3 From 178a9385900ef19c9a5838e699dde20dfd6e719a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 03:56:39 +0000 Subject: [PATCH 05/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../schema/metadata/configs/optimizers.yaml | 2 +- ludwig/schema/optimizers.py | 122 ++++++------------ 2 files changed, 41 insertions(+), 83 deletions(-) diff --git a/ludwig/schema/metadata/configs/optimizers.yaml b/ludwig/schema/metadata/configs/optimizers.yaml index f28b246bb8d..bd68f3c847b 100644 --- a/ludwig/schema/metadata/configs/optimizers.yaml +++ b/ludwig/schema/metadata/configs/optimizers.yaml @@ -56,4 +56,4 @@ alpha: eps: expected_impact: 1 centered: - expected_impact: 1 \ No newline at end of file + expected_impact: 1 diff --git a/ludwig/schema/optimizers.py b/ludwig/schema/optimizers.py index 25b98f3ef23..5f97324be4e 100644 --- a/ludwig/schema/optimizers.py +++ b/ludwig/schema/optimizers.py @@ -66,24 +66,16 @@ class SGDOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD : momentum: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Momentum factor.", - parameter_metadata=OPTIMIZER_METADATA["momentum"] + default=0.0, description="Momentum factor.", parameter_metadata=OPTIMIZER_METADATA["momentum"] ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) dampening: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Dampening for momentum.", - parameter_metadata=OPTIMIZER_METADATA["dampening"] + default=0.0, description="Dampening for momentum.", parameter_metadata=OPTIMIZER_METADATA["dampening"] ) nesterov: bool = schema_utils.Boolean( - default=False, - description="Enables Nesterov momentum.", - parameter_metadata=OPTIMIZER_METADATA["nesterov"] + default=False, description="Enables Nesterov momentum.", parameter_metadata=OPTIMIZER_METADATA["nesterov"] ) @@ -104,39 +96,37 @@ class LBFGSOptimizerConfig(BaseOptimizerConfig): max_iter: int = schema_utils.Integer( default=20, description="Maximum number of iterations per optimization step.", - parameter_metadata=OPTIMIZER_METADATA["max_iter"] + 
parameter_metadata=OPTIMIZER_METADATA["max_iter"], ) max_eval: int = schema_utils.Integer( default=None, allow_none=True, description="Maximum number of function evaluations per optimization step. Default: `max_iter` * 1.25.", - parameter_metadata=OPTIMIZER_METADATA["max_eval"] + parameter_metadata=OPTIMIZER_METADATA["max_eval"], ) tolerance_grad: float = schema_utils.NonNegativeFloat( default=1e-07, description="Termination tolerance on first order optimality.", - parameter_metadata=OPTIMIZER_METADATA["tolerance_grad"] + parameter_metadata=OPTIMIZER_METADATA["tolerance_grad"], ) tolerance_change: float = schema_utils.NonNegativeFloat( default=1e-09, description="Termination tolerance on function value/parameter changes.", - parameter_metadata=OPTIMIZER_METADATA["tolerance_change"] + parameter_metadata=OPTIMIZER_METADATA["tolerance_change"], ) history_size: int = schema_utils.Integer( - default=100, - description="Update history size.", - parameter_metadata=OPTIMIZER_METADATA["history_size"] + default=100, description="Update history size.", parameter_metadata=OPTIMIZER_METADATA["history_size"] ) line_search_fn: str = schema_utils.StringOptions( ["strong_wolfe"], default=None, description="Line search function to use.", - parameter_metadata=OPTIMIZER_METADATA["line_search_fn"] + parameter_metadata=OPTIMIZER_METADATA["line_search_fn"], ) @@ -157,26 +147,24 @@ class AdamOptimizerConfig(BaseOptimizerConfig): betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square.", - parameter_metadata=OPTIMIZER_METADATA["betas"] + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-08, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay (L2 penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay (L2 penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) amsgrad: bool = schema_utils.Boolean( default=False, description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam " - "and Beyond'.", - parameter_metadata=OPTIMIZER_METADATA["amsgrad"] + "and Beyond'.", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"], ) @@ -197,26 +185,24 @@ class AdamWOptimizerConfig(BaseOptimizerConfig): betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square.", - parameter_metadata=OPTIMIZER_METADATA["betas"] + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-08, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) amsgrad: bool = schema_utils.Boolean( default=False, description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the 
Convergence of Adam " - "and Beyond'. ", - parameter_metadata=OPTIMIZER_METADATA["amsgrad"] + "and Beyond'. ", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"], ) @@ -239,19 +225,17 @@ class AdadeltaOptimizerConfig(BaseOptimizerConfig): min=0, max=1, description="Coefficient used for computing a running average of squared gradients.", - parameter_metadata=OPTIMIZER_METADATA["rho"] + parameter_metadata=OPTIMIZER_METADATA["rho"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-06, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) @@ -271,27 +255,21 @@ class AdagradOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adagrad.html#torch.optim.Adagrad : initial_accumulator_value: float = schema_utils.NonNegativeFloat( - default=0, - description="", - parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + default=0, description="", parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] ) lr_decay: float = schema_utils.FloatRange( - default=0, - description="Learning rate decay.", - parameter_metadata=OPTIMIZER_METADATA["lr_decay"] + default=0, description="Learning rate decay.", parameter_metadata=OPTIMIZER_METADATA["lr_decay"] ) weight_decay: float = schema_utils.FloatRange( - default=0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) eps: float = schema_utils.FloatRange( default=1e-10, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) @@ -312,19 +290,17 @@ class AdamaxOptimizerConfig(BaseOptimizerConfig): betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square.", - parameter_metadata=OPTIMIZER_METADATA["betas"] + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-08, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) @@ -337,24 +313,19 @@ class FtrlOptimizerConfig(BaseOptimizerConfig): type: str = schema_utils.ProtectedString("ftrl") learning_rate_power: float = schema_utils.FloatRange( - default=-0.5, - max=0, - parameter_metadata=OPTIMIZER_METADATA["learning_rate_power"] + default=-0.5, max=0, parameter_metadata=OPTIMIZER_METADATA["learning_rate_power"] ) initial_accumulator_value: float = schema_utils.NonNegativeFloat( - default=0.1, - parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + default=0.1, 
parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] ) l1_regularization_strength: float = schema_utils.NonNegativeFloat( - default=0.0, - parameter_metadata=OPTIMIZER_METADATA["l1_regularization_strength"] + default=0.0, parameter_metadata=OPTIMIZER_METADATA["l1_regularization_strength"] ) l2_regularization_strength: float = schema_utils.NonNegativeFloat( - default=0.0, - parameter_metadata=OPTIMIZER_METADATA["l2_regularization_strength"] + default=0.0, parameter_metadata=OPTIMIZER_METADATA["l2_regularization_strength"] ) @@ -382,15 +353,11 @@ class NadamOptimizerConfig(BaseOptimizerConfig): ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) momentum_decay: float = schema_utils.NonNegativeFloat( - default=4e-3, - description="Momentum decay.", - parameter_metadata=OPTIMIZER_METADATA["momentum_decay"] + default=4e-3, description="Momentum decay.", parameter_metadata=OPTIMIZER_METADATA["momentum_decay"] ) @@ -429,7 +396,7 @@ class RMSPropOptimizerConfig(BaseOptimizerConfig): centered: bool = schema_utils.Boolean( default=False, description="If True, computes the centered RMSProp, and the gradient is normalized by an estimation of its " - "variance.", + "variance.", parameter_metadata=OPTIMIZER_METADATA["centered"], ) @@ -535,24 +502,15 @@ class GradientClippingConfig(schema_utils.BaseMarshmallowConfig): """Dataclass that holds gradient clipping parameters.""" clipglobalnorm: Optional[float] = schema_utils.FloatRange( - default=0.5, - allow_none=True, - description="", - parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] + default=0.5, allow_none=True, description="", parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipnorm: Optional[float] = schema_utils.FloatRange( - default=None, - allow_none=True, - description="", - parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] + default=None, allow_none=True, description="", parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipvalue: Optional[float] = schema_utils.FloatRange( - default=None, - allow_none=True, - description="", - parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] + default=None, allow_none=True, description="", parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) From 3f31acf2147a0857b982f94f22ba547e5d0b41d5 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 23:10:52 -0700 Subject: [PATCH 06/22] Encoder expected impacts --- ludwig/schema/encoders/sequence_encoders.py | 6 +- ludwig/schema/metadata/configs/encoders.yaml | 521 ++++++++++++------- ludwig/schema/metadata/configs/features.yaml | 3 +- 3 files changed, 349 insertions(+), 181 deletions(-) diff --git a/ludwig/schema/encoders/sequence_encoders.py b/ludwig/schema/encoders/sequence_encoders.py index ab2ae41ecba..c244a4bb59e 100644 --- a/ludwig/schema/encoders/sequence_encoders.py +++ b/ludwig/schema/encoders/sequence_encoders.py @@ -168,7 +168,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=None, + default=1, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["ParallelCNN"]["num_conv_layers"], ) @@ -336,7 +336,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=None, + default=1, description="Number of parallel convolutional layers to 
use.", parameter_metadata=ENCODER_METADATA["StackedCNN"]["num_conv_layers"], ) @@ -1063,7 +1063,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=None, + default=1, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["StackedCNNRNN"]["num_conv_layers"], ) diff --git a/ludwig/schema/metadata/configs/encoders.yaml b/ludwig/schema/metadata/configs/encoders.yaml index c448f67e3bc..f66e61739e3 100644 --- a/ludwig/schema/metadata/configs/encoders.yaml +++ b/ludwig/schema/metadata/configs/encoders.yaml @@ -20,7 +20,7 @@ ALBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -35,6 +35,7 @@ ALBERT: bos_token_id: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: Beginning-of-Sentence Token Id + expected_impact: 1 classifier_dropout_prob: default_value_reasoning: Huggingface default. description_implications: @@ -46,7 +47,7 @@ ALBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -78,7 +79,7 @@ ALBERT: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -90,6 +91,7 @@ ALBERT: eos_token_id: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: End-of-Sentence Token Id + expected_impact: 1 hidden_act: default_value_reasoning: Taken from huggingface. description_implications: @@ -97,7 +99,7 @@ ALBERT: the feed-forward layers of the transformer. example_value: - relu - expected_impact: 2 + expected_impact: 1 literature_references: - "[Hugging face docs for ALBERT config](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertConfig.hidden_act)\n\ \r\n[Relevant StackOverflow discussion](https://ai.stackexchange.com/questions/30341/why-does-a-transformer-not-use-an-activation-function-following-the-multi-head-a)" @@ -118,7 +120,7 @@ ALBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -138,7 +140,7 @@ ALBERT: Increasing the hidden size makes the model larger and slower to train, increases the model's capacity to capture more complexity. It also increases the chance of overfitting. - expected_impact: 2 + expected_impact: 1 suggested_values: 10 - 2048 suggested_values_reasoning: Increasing the hidden size makes sense if the @@ -152,7 +154,7 @@ ALBERT: lead to the outputs of these matrices to vanish or explode example_value: - 0.02 - expected_impact: 3 + expected_impact: 1 other_information: Must be greater than 0 related_parameters: - weights_initializer @@ -163,10 +165,13 @@ ALBERT: ui_display_name: null inner_group_num: ui_display_name: null + expected_impact: 1 intermediate_size: ui_display_name: null + expected_impact: 1 layer_norm_eps: ui_display_name: null + expected_impact: 1 max_position_embeddings: default_value_reasoning: Taken from huggingface. 
description_implications: @@ -179,7 +184,7 @@ ALBERT: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality." - expected_impact: 2 + expected_impact: 1 suggested_values: 512 suggested_values_reasoning: Out of the box value based on published literature. @@ -190,18 +195,23 @@ ALBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null + expected_impact: 1 num_hidden_groups: ui_display_name: null + expected_impact: 1 num_hidden_layers: ui_display_name: null + expected_impact: 1 pad_token_id: ui_display_name: null + expected_impact: 1 position_embedding_type: ui_display_name: null + expected_impact: 1 pretrained_kwargs: default_value_reasoning: These arguments typically don't need to be specified. expected_impact: 1 @@ -241,6 +251,7 @@ ALBERT: ui_display_name: Pretrained model reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -248,7 +259,7 @@ ALBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -259,12 +270,14 @@ ALBERT: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null + expected_impact: 1 use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -294,16 +307,19 @@ AutoTransformer: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pretrained_kwargs: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null vocab: default_value_reasoning: @@ -471,7 +487,7 @@ BERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null @@ -485,8 +501,10 @@ BERT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -494,7 +512,7 @@ BERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -505,12 +523,13 @@ BERT: 2. the user doesn't have a lot of storage. 
ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -544,7 +563,7 @@ BagEmbedWeighted: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -620,7 +639,7 @@ BagEmbedWeighted: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -724,7 +743,7 @@ BagEmbedWeighted: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -801,7 +820,7 @@ BagEmbedWeighted: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -824,7 +843,7 @@ BagEmbedWeighted: model may have a head start in its representation of various input entities. example_value: - ~/Downloads/glove.6B.100d.txt - expected_impact: 2 + expected_impact: 0 related_parameters: - embedding_size, embeddings_trainable ui_display_name: Pretrained embeddings path @@ -895,7 +914,7 @@ BagEmbedWeighted: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -947,7 +966,7 @@ CTRL: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_ctx: ui_display_name: null @@ -963,8 +982,10 @@ CTRL: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 resid_pdrop: ui_display_name: null saved_weights_in_checkpoint: @@ -974,7 +995,7 @@ CTRL: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -985,10 +1006,11 @@ CTRL: 2. the user doesn't have a lot of storage. 
ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1152,7 +1174,7 @@ CamemBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null @@ -1166,8 +1188,10 @@ CamemBERT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -1175,7 +1199,7 @@ CamemBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -1186,12 +1210,13 @@ CamemBERT: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1274,7 +1299,7 @@ CategoricalEmbed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -1305,8 +1330,10 @@ CategoricalEmbed: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1315,7 +1342,6 @@ CategoricalEmbed: - a - b - c - expected_impact: 2 internal_only: true ui_display_name: Not Displayed CategoricalSparse: @@ -1386,7 +1412,7 @@ CategoricalSparse: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -1417,8 +1443,10 @@ CategoricalSparse: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1447,7 +1475,7 @@ DateEmbed: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -1523,7 +1551,7 @@ DateEmbed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -1598,7 +1626,7 @@ DateEmbed: rate. 
example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -1675,7 +1703,7 @@ DateEmbed: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1730,7 +1758,7 @@ DateEmbed: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -1762,7 +1790,7 @@ DateWave: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -1864,7 +1892,7 @@ DateWave: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -1941,7 +1969,7 @@ DateWave: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1996,7 +2024,7 @@ DateWave: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -2026,7 +2054,7 @@ DenseEncoder: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -2083,6 +2111,7 @@ DenseEncoder: performance. ui_display_name: Dropout input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -2101,7 +2130,7 @@ DenseEncoder: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -2159,7 +2188,7 @@ DenseEncoder: due to overfitting." 
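To make the layer-count trade-off described above concrete, here is a minimal, hypothetical PyTorch sketch (not Ludwig's actual dense encoder code; the helper name and all sizes are made up) of how a num_layers-style knob typically turns into a stack of fully connected layers, where each extra layer adds capacity and overfitting risk:

    import torch.nn as nn

    def build_dense_stack(input_size: int, output_size: int, num_layers: int) -> nn.Sequential:
        # Each extra layer adds capacity (and overfitting risk); one layer is often
        # enough for a single numeric input feature.
        layers, in_features = [], input_size
        for _ in range(num_layers):
            layers.append(nn.Linear(in_features, output_size))
            layers.append(nn.ReLU())
            in_features = output_size
        return nn.Sequential(*layers)

    shallow = build_dense_stack(input_size=16, output_size=32, num_layers=1)
    deep = build_dense_stack(input_size=16, output_size=32, num_layers=4)
    print(sum(p.numel() for p in shallow.parameters()))  # 544
    print(sum(p.numel() for p in deep.parameters()))     # 3712: each added layer costs 32*32 + 32 parameters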
example_value: - 1 - expected_impact: 1 + expected_impact: 3 other_information: If you have multiple input features, varying the number of layers in the combiner or output feature decoder will have more impact. @@ -2180,7 +2209,7 @@ DenseEncoder: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -2211,7 +2240,7 @@ DenseEncoder: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -2243,7 +2272,7 @@ DistilBERT: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -2340,7 +2369,7 @@ DistilBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_heads: ui_display_name: null @@ -2350,6 +2379,7 @@ DistilBERT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 qa_dropout: default_value_reasoning: Huggingface default. description_implications: @@ -2375,6 +2405,7 @@ DistilBERT: ui_display_name: qa_dropout reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -2382,7 +2413,7 @@ DistilBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -2422,10 +2453,11 @@ DistilBERT: sinusoidal_pos_embds: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2518,7 +2550,7 @@ ELECTRA: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -2624,7 +2656,7 @@ ELECTRA: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. 
- expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null @@ -2636,8 +2668,10 @@ ELECTRA: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -2645,7 +2679,7 @@ ELECTRA: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -2656,12 +2690,13 @@ ELECTRA: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2688,6 +2723,7 @@ FlauBERT: - https://arxiv.org/abs/1912.05372 asm: ui_display_name: null + expected_impact: 1 attention_dropout: default_value_reasoning: Huggingface default. description_implications: @@ -2699,7 +2735,7 @@ FlauBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -2713,8 +2749,10 @@ FlauBERT: ui_display_name: attention_dropout bos_index: ui_display_name: null + expected_impact: 1 causal: ui_display_name: null + expected_impact: 1 dropout: default_value_reasoning: Huggingface default. description_implications: @@ -2740,27 +2778,38 @@ FlauBERT: ui_display_name: dropout emb_dim: ui_display_name: null + expected_impact: 1 embed_init_std: ui_display_name: null + expected_impact: 1 eos_index: ui_display_name: null + expected_impact: 1 gelu_activation: ui_display_name: null + expected_impact: 1 init_std: ui_display_name: null + expected_impact: 1 is_encoder: ui_display_name: null + expected_impact: 1 lang_id: ui_display_name: null + expected_impact: 1 layer_norm_eps: ui_display_name: null + expected_impact: 1 layerdrop: ui_display_name: null + expected_impact: 1 mask_index: ui_display_name: null + expected_impact: 1 mask_token_id: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: Mask Token ID + expected_impact: 1 max_position_embeddings: default_value_reasoning: Taken from huggingface. description_implications: @@ -2773,7 +2822,7 @@ FlauBERT: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality." - expected_impact: 2 + expected_impact: 1 suggested_values: 512 suggested_values_reasoning: Out of the box value based on published literature. @@ -2784,26 +2833,33 @@ FlauBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_head: ui_display_name: null + expected_impact: 1 n_langs: default_value_reasoning: Default value used in pre-trained HF encoder. 
expected_impact: 1 ui_display_name: Number of Languages n_layer: ui_display_name: null + expected_impact: 1 pad_index: ui_display_name: null + expected_impact: 1 pre_norm: ui_display_name: null + expected_impact: 1 pretrained_kwargs: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -2811,7 +2867,7 @@ FlauBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -2823,15 +2879,19 @@ FlauBERT: ui_display_name: null sinusoidal_embeddings: ui_display_name: null + expected_impact: 1 trainable: - expected_impact: 2 ui_display_name: null + expected_impact: 3 unk_index: ui_display_name: null + expected_impact: 1 use_lang_emb: ui_display_name: null + expected_impact: 1 use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2883,7 +2943,7 @@ GPT2: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_ctx: ui_display_name: null @@ -2901,17 +2961,20 @@ GPT2: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 resid_pdrop: ui_display_name: null scale_attn_weights: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2963,7 +3026,7 @@ GPT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_ctx: ui_display_name: null @@ -2979,8 +3042,10 @@ GPT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 resid_pdrop: ui_display_name: null saved_weights_in_checkpoint: @@ -2990,7 +3055,7 @@ GPT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -3001,10 +3066,11 @@ GPT: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -3013,7 +3079,6 @@ GPT: - a - b - c - expected_impact: 2 internal_only: true ui_display_name: Not Displayed vocab_size: @@ -3037,7 +3102,7 @@ H3Embed: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -3113,7 +3178,7 @@ H3Embed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
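As a quick worked instance of the 1.6 * sqrt(vocab_size) sizing rule of thumb suggested repeatedly for these embedding_size parameters (the vocabulary size below is purely illustrative):

    import math

    vocab_size = 10_000                            # hypothetical vocabulary size
    embedding_size = 1.6 * math.sqrt(vocab_size)   # rule of thumb quoted in the metadata above
    print(round(embedding_size))                   # 160, a reasonable starting value to tune from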
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -3188,7 +3253,7 @@ H3Embed: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -3265,7 +3330,7 @@ H3Embed: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -3306,6 +3371,7 @@ H3Embed: ui_display_name: Sequence Reducer use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). description_implications: @@ -3321,7 +3387,7 @@ H3Embed: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -3347,6 +3413,7 @@ H3RNN: encoding the path in the tree of all H3 hexes. activation: ui_display_name: null + expected_impact: 1 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -3358,7 +3425,7 @@ H3RNN: constant value such as 0.01 for all biases to ensure that all ReLU units are activated in the beginning and have some effect on the gradient. However, it's still an open question as to whether this provides consistent improvement. - expected_impact: 1 + expected_impact: 2 literature_references: - https://cs231n.github.io/neural-networks-2/ related_parameters: @@ -3382,7 +3449,7 @@ H3RNN: Setting bidirectional to True may increase the compute and memory requirements of the model, but may also increase model performance on long sequences. - expected_impact: 3 + expected_impact: 0 literature_references: - https://devopedia.org/bidirectional-rnn#:~:text=RNN%20has%20the%20limitation%20that,forward%20and%20reverse%20time%20order. related_parameters: @@ -3404,7 +3471,7 @@ H3RNN: (1) compute costs and (2) catastrophic forgetting (source: https://en.wikipedia.org/wiki/Catastrophic_interference ). RNNs have marginally less compute costs, but are prone to catastrophic forgetting." - expected_impact: 1 + expected_impact: 3 related_parameters: - "bidirectional @@ -3460,7 +3527,7 @@ H3RNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -3514,7 +3581,7 @@ H3RNN: performance for longer sequences or more complex tasks. 
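A small, hedged PyTorch sketch (illustrative only, not the H3RNN implementation; all sizes are invented) of how the cell type, number of recurrent layers, and bidirectionality discussed above drive model size and output width:

    import torch
    import torch.nn as nn

    def param_count(module: nn.Module) -> int:
        return sum(p.numel() for p in module.parameters())

    rnn = nn.RNN(input_size=32, hidden_size=64, num_layers=1)     # cheapest cell, prone to forgetting
    lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=1)   # more parameters, less forgetting
    deep_bilstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=2, bidirectional=True)

    print(param_count(rnn), param_count(lstm), param_count(deep_bilstm))

    out, _ = deep_bilstm(torch.randn(10, 4, 32))   # (sequence length, batch, features)
    print(out.shape)                               # torch.Size([10, 4, 128]): bidirectionality doubles the output width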
example_value: - 1 - expected_impact: 1 + expected_impact: 3 other_information: If you have multiple input features, varying the number of layers in the combiner or output feature decoder will have more impact. @@ -3528,7 +3595,7 @@ H3RNN: ui_display_name: Number of Recurrent Layers recurrent_activation: default_value_reasoning: sigmoid' is commonly used - expected_impact: 3 + expected_impact: 1 other_information: I don't think that this parameter is used anywhere in the code base. It's being passed down but not used in the actual RNN forwarding @@ -3549,7 +3616,7 @@ H3RNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 2 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -3563,10 +3630,13 @@ H3RNN: ui_display_name: Recurrent Dropout recurrent_initializer: ui_display_name: null + expected_impact: 1 reduce_output: ui_display_name: null + expected_impact: 1 unit_forget_bias: ui_display_name: null + expected_impact: 1 use_bias: ui_display_name: null weights_initializer: @@ -3584,7 +3654,7 @@ H3RNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -3616,7 +3686,7 @@ H3WeightedSum: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -3692,7 +3762,7 @@ H3WeightedSum: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -3767,7 +3837,7 @@ H3WeightedSum: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -3844,7 +3914,7 @@ H3WeightedSum: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -3860,8 +3930,10 @@ H3WeightedSum: ui_display_name: Output Size should_softmax: ui_display_name: null + expected_impact: 1 use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). description_implications: @@ -3877,7 +3949,7 @@ H3WeightedSum: provides a few good options. 
See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -3907,7 +3979,7 @@ Longformer: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_tokens: ui_display_name: null @@ -3915,8 +3987,10 @@ Longformer: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -3924,7 +3998,7 @@ Longformer: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -3937,10 +4011,11 @@ Longformer: sep_token_id: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -4008,7 +4083,7 @@ MLPMixer: performance for larger images or more complex image tasks. example_value: - 8 - expected_impact: 1 + expected_impact: 3 literature_references: - "MLP-Mixer: An all-MLP Architecture for Vision @@ -4102,7 +4177,7 @@ MT5: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_decoder_layers: ui_display_name: null @@ -4121,7 +4196,7 @@ MT5: from the pre-trained model." example_value: - 8 - expected_impact: 1 + expected_impact: 3 related_parameters: - pretrained_model_or_path suggested_values: 1 - 12 @@ -4136,8 +4211,10 @@ MT5: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 relative_attention_num_buckets: ui_display_name: null saved_weights_in_checkpoint: @@ -4147,7 +4224,7 @@ MT5: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -4187,12 +4264,13 @@ MT5: tokenizer_class: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_cache: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -4228,7 +4306,7 @@ ParallelCNN: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -4320,7 +4398,7 @@ ParallelCNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -4351,6 +4429,7 @@ ParallelCNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -4387,11 +4466,12 @@ ParallelCNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -4405,7 +4485,7 @@ ParallelCNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -4453,7 +4533,7 @@ ParallelCNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -4495,7 +4575,7 @@ ParallelCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -4511,14 +4591,19 @@ ParallelCNN: ui_display_name: Output Size pool_function: ui_display_name: Pooling function + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -4574,7 +4659,7 @@ ParallelCNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -4597,6 +4682,7 @@ PassthroughEncoder: placeholders as outputs. Inputs are of size `b` while outputs are of size `b x 1` where `b` is the batch size. input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -4610,6 +4696,7 @@ BinaryPassthroughEncoder: placeholders as outputs. Inputs are of size `b` while outputs are of size `b x 1` where `b` is the batch size. input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -4623,6 +4710,7 @@ CategoricalPassthroughEncoder: placeholders as outputs. Inputs are of size `b` while outputs are of size `b x 1` where `b` is the batch size. 
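The shape contract described for these passthrough encoders can be shown with a tiny tensor sketch (this is only the `b` to `b x 1` reshape, not the actual encoder classes):

    import torch

    b = 4                         # batch size
    x = torch.randn(b)            # raw inputs of size b
    out = x.unsqueeze(-1)         # passthrough output of size b x 1
    assert out.shape == (b, 1)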
input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -4644,7 +4732,7 @@ ResNet: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -4742,8 +4830,10 @@ ResNet: ui_display_name: Fully Connected Layers first_pool_kernel_size: ui_display_name: null + expected_impact: 1 first_pool_stride: ui_display_name: null + expected_impact: 1 height: internal_only: true ui_display_name: null @@ -4761,7 +4851,7 @@ ResNet: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -4842,7 +4932,7 @@ ResNet: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -4860,6 +4950,7 @@ ResNet: ui_display_name: null use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). description_implications: @@ -4875,7 +4966,7 @@ ResNet: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -4916,7 +5007,7 @@ RoBERTa: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pad_token_id: ui_display_name: null @@ -4924,8 +5015,10 @@ RoBERTa: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -4933,7 +5026,7 @@ RoBERTa: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -4944,10 +5037,11 @@ RoBERTa: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -5015,7 +5109,7 @@ SequenceEmbed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -5046,18 +5140,22 @@ SequenceEmbed: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 vocab: default_value_reasoning: Computed and passed along internally according to @@ -5084,7 +5182,7 @@ SequenceEmbed: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -5125,10 +5223,11 @@ SequencePassthrough: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null reduce_output: ui_display_name: null + expected_impact: 1 SetSparseEncoder: type: short_description: @@ -5148,7 +5247,7 @@ SetSparseEncoder: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -5224,7 +5323,7 @@ SetSparseEncoder: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -5255,6 +5354,7 @@ SetSparseEncoder: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -5301,7 +5401,7 @@ SetSparseEncoder: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -5378,7 +5478,7 @@ SetSparseEncoder: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -5394,8 +5494,10 @@ SetSparseEncoder: ui_display_name: Output Size pretrained_embeddings: ui_display_name: null + expected_impact: 0 representation: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -5448,7 +5550,7 @@ SetSparseEncoder: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -5590,7 +5692,7 @@ Stacked2DCNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -5695,7 +5797,7 @@ Stacked2DCNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -5738,7 +5840,7 @@ Stacked2DCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -5846,7 +5948,7 @@ StackedCNN: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -5957,7 +6059,7 @@ StackedCNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -5988,6 +6090,7 @@ StackedCNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -6024,11 +6127,12 @@ StackedCNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -6042,7 +6146,7 @@ StackedCNN: rate. 
example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -6090,7 +6194,7 @@ StackedCNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -6132,7 +6236,7 @@ StackedCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -6150,18 +6254,25 @@ StackedCNN: ui_display_name: null pool_function: ui_display_name: null + expected_impact: 1 pool_padding: ui_display_name: null + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pool_strides: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -6202,6 +6313,7 @@ StackedCNN: ui_display_name: Stride use_bias: ui_display_name: null + expected_impact: 1 vocab: default_value_reasoning: Computed and passed along internally according to @@ -6210,7 +6322,7 @@ StackedCNN: - a - b - c - expected_impact: 2 + expected_impact: 1 internal_only: true ui_display_name: Not Displayed weights_initializer: @@ -6228,7 +6340,7 @@ StackedCNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -6254,6 +6366,7 @@ StackedCNNRNN: last output, but can perform other reduce functions. activation: ui_display_name: null + expected_impact: 2 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -6283,10 +6396,13 @@ StackedCNNRNN: ui_display_name: Bias Initializer bidirectional: ui_display_name: null + expected_impact: 0 cell_type: ui_display_name: null + expected_impact: 3 conv_activation: ui_display_name: null + expected_impact: 1 conv_dropout: default_value_reasoning: Dropout can cause training to become less stable. @@ -6406,7 +6522,7 @@ StackedCNNRNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -6437,6 +6553,7 @@ StackedCNNRNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -6472,7 +6589,7 @@ StackedCNNRNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -6526,11 +6643,12 @@ StackedCNNRNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -6544,7 +6662,7 @@ StackedCNNRNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -6592,7 +6710,7 @@ StackedCNNRNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -6636,7 +6754,7 @@ StackedCNNRNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -6654,17 +6772,22 @@ StackedCNNRNN: ui_display_name: null pool_function: ui_display_name: null + expected_impact: 1 pool_padding: ui_display_name: null + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pool_strides: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 recurrent_activation: default_value_reasoning: sigmoid' is commonly used - expected_impact: 3 + expected_impact: 1 other_information: I don't think that this parameter is used anywhere in the code base. It's being passed down but not used in the actual RNN forwarding @@ -6685,7 +6808,7 @@ StackedCNNRNN: \ generalization." 
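For the dropout parameters discussed throughout these entries, a tiny PyTorch sketch of the mechanism (purely illustrative; the 0.2 rate mirrors the example value used above):

    import torch
    import torch.nn as nn

    drop = nn.Dropout(p=0.2)      # each activation is zeroed with probability 0.2 while training
    x = torch.ones(8)

    drop.train()
    print(drop(x))                # some entries are 0, the survivors are scaled by 1 / (1 - 0.2)

    drop.eval()
    print(drop(x))                # dropout is a no-op at evaluation time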
example_value: - 0.2 - expected_impact: 3 + expected_impact: 2 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -6705,15 +6828,19 @@ StackedCNNRNN: ui_display_name: Recurrent Dropout recurrent_initializer: ui_display_name: null + expected_impact: 1 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed state_size: ui_display_name: null + expected_impact: 3 strides: default_value_reasoning: In general, it makes sense to have a smaller stride @@ -6751,6 +6878,7 @@ StackedCNNRNN: ui_display_name: Stride unit_forget_bias: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -6803,7 +6931,7 @@ StackedCNNRNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -6839,7 +6967,7 @@ StackedParallelCNN: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -6915,7 +7043,7 @@ StackedParallelCNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -6946,6 +7074,7 @@ StackedParallelCNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -6982,11 +7111,12 @@ StackedParallelCNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -7000,7 +7130,7 @@ StackedParallelCNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -7089,7 +7219,7 @@ StackedParallelCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -7105,14 +7235,19 @@ StackedParallelCNN: ui_display_name: Output Size pool_function: ui_display_name: null + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -7170,7 +7305,7 @@ StackedParallelCNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -7195,6 +7330,7 @@ StackedRNN: operation that by default only returns the last output, but can perform other reduce functions. activation: ui_display_name: null + expected_impact: 2 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -7224,8 +7360,10 @@ StackedRNN: ui_display_name: Bias Initializer bidirectional: ui_display_name: null + expected_impact: 0 cell_type: ui_display_name: null + expected_impact: 3 dropout: default_value_reasoning: Dropout can cause training to become less stable. @@ -7276,7 +7414,7 @@ StackedRNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -7307,6 +7445,7 @@ StackedRNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -7342,7 +7481,7 @@ StackedRNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -7392,7 +7531,7 @@ StackedRNN: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -7406,7 +7545,7 @@ StackedRNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -7483,7 +7622,7 @@ StackedRNN: performance for longer sequences or more complex tasks. example_value: - 1 - expected_impact: 1 + expected_impact: 3 suggested_values: 1-3 suggested_values_reasoning: Increasing the number of layers may improve encoder @@ -7499,7 +7638,7 @@ StackedRNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
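A brief, hypothetical sketch of the weights_initializer guidance above (plain PyTorch init calls on a stand-in layer; Ludwig's own initializer plumbing is not shown):

    import torch.nn as nn

    layer = nn.Linear(256, 256)

    nn.init.xavier_uniform_(layer.weight)      # the Glorot/Xavier scheme referenced above
    nn.init.zeros_(layer.bias)                 # biases are commonly just zeroed

    # Poor choices make the difference obvious:
    # nn.init.zeros_(layer.weight)             # every unit starts identical and learns the same thing
    # nn.init.normal_(layer.weight, std=5.0)   # overly large values can saturate activations early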
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -7515,9 +7654,10 @@ StackedRNN: ui_display_name: Output Size pretrained_embeddings: ui_display_name: null + expected_impact: 0 recurrent_activation: default_value_reasoning: sigmoid' is commonly used - expected_impact: 3 + expected_impact: 1 other_information: I don't think that this parameter is used anywhere in the code base. It's being passed down but not used in the actual RNN forwarding @@ -7538,7 +7678,7 @@ StackedRNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 2 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -7556,17 +7696,22 @@ StackedRNN: ui_display_name: Recurrent Dropout recurrent_initializer: ui_display_name: null + expected_impact: 1 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed state_size: ui_display_name: null + expected_impact: 3 unit_forget_bias: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -7619,7 +7764,7 @@ StackedRNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -7713,7 +7858,7 @@ StackedTransformer: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -7744,6 +7889,7 @@ StackedTransformer: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -7779,7 +7925,7 @@ StackedTransformer: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -7844,7 +7990,7 @@ StackedTransformer: Sets the maximum sequence length of the expected inputs, so input/output shapes and the positional embedding matrix are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -7858,7 +8004,7 @@ StackedTransformer: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -7940,7 +8086,7 @@ StackedTransformer: while providing diminishing returns of model performance." 
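To make the capacity-versus-diminishing-returns point above tangible, a small hypothetical sketch using PyTorch's generic transformer encoder (not Ludwig's StackedTransformer; d_model and nhead are arbitrary) showing that parameter count grows roughly linearly with the number of layers:

    import torch.nn as nn

    def transformer_params(num_layers: int) -> int:
        layer = nn.TransformerEncoderLayer(d_model=64, nhead=4)
        encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        return sum(p.numel() for p in encoder.parameters())

    for n in (1, 3, 6):
        print(n, transformer_params(n))   # parameters grow ~linearly with depth; quality gains usually do not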
example_value: - 1 - expected_impact: 1 + expected_impact: 3 suggested_values: 1 - 12 suggested_values_reasoning: Increasing the number of layers may improve encoder @@ -7956,7 +8102,7 @@ StackedTransformer: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -7972,10 +8118,13 @@ StackedTransformer: ui_display_name: Output Size pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -8054,7 +8203,7 @@ StackedTransformer: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -8118,7 +8267,7 @@ T5: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_decoder_layers: ui_display_name: null @@ -8137,7 +8286,7 @@ T5: from the pre-trained model." example_value: - 6 - expected_impact: 1 + expected_impact: 3 related_parameters: - pretrained_model_or_path suggested_values: 1 - 12 @@ -8150,8 +8299,10 @@ T5: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 relative_attention_num_buckets: ui_display_name: null saved_weights_in_checkpoint: @@ -8161,7 +8312,7 @@ T5: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8172,10 +8323,11 @@ T5: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -8264,7 +8416,7 @@ TransformerXL: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null mem_len: ui_display_name: null @@ -8278,12 +8430,14 @@ TransformerXL: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 proj_init_std: ui_display_name: null proj_share_all_but_first: ui_display_name: null reduce_output: ui_display_name: null + expected_impact: 1 same_length: ui_display_name: null sample_softmax: @@ -8295,7 +8449,7 @@ TransformerXL: description_implications: The memory footprint for some of these encoders can be large. 
- expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8306,12 +8460,13 @@ TransformerXL: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null untie_r: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -8341,7 +8496,7 @@ TVBaseEncoder: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8358,7 +8513,7 @@ TVBaseEncoder: and flexibility. If False, less weights are subject to change and the model will therefore train faster. However, the representations output by this component are fixed for each input. - expected_impact: 2 + expected_impact: 3 literature_references: - "https://www.ibm.com/cloud/learn/overfitting @@ -8382,7 +8537,7 @@ TVBaseEncoder: Pretrained models have typically already learned features that are difficult to learn from scratch. They are particularly beneficial when training on small amounts of data. - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/transfer-learning-for-deep-learning/ related_parameters: @@ -8662,7 +8817,7 @@ ViT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8679,7 +8834,7 @@ ViT: and flexibility. If False, less weights are subject to change and the model will therefore train faster. However, the representations output by this component are fixed for each input. - expected_impact: 2 + expected_impact: 3 literature_references: - "https://www.ibm.com/cloud/learn/overfitting @@ -8703,7 +8858,7 @@ ViT: Pretrained models have typically already learned features that are difficult to learn from scratch. They are particularly beneficial when training on small amounts of data. - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/transfer-learning-for-deep-learning/ related_parameters: @@ -8790,6 +8945,7 @@ XLM: ui_display_name: null gelu_activation: ui_display_name: null + expected_impact: 1 init_std: ui_display_name: null is_encoder: @@ -8826,7 +8982,7 @@ XLM: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_heads: ui_display_name: null @@ -8844,8 +9000,10 @@ XLM: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -8853,7 +9011,7 @@ XLM: description_implications: The memory footprint for some of these encoders can be large. 
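A pattern repeated across these encoder hunks is swapping expected_impact: 1 for internal_only: true on fields the toolkit appears to fill in itself (the maximum-sequence-length entries, saved_weights_in_checkpoint). Assuming internal_only means the parameter is withheld from user-facing impact grading rather than ranked, the flagged fields for a given encoder can be listed with plain PyYAML (illustrative sketch, not Ludwig's loader):

    # Sketch: list which TransformerXL metadata entries are now internal_only,
    # per the hunks above. Parameter names are read from the file, not assumed.
    import yaml

    with open("ludwig/schema/metadata/configs/encoders.yaml") as f:
        meta = yaml.safe_load(f)

    hidden = [name for name, entry in meta["TransformerXL"].items()
              if isinstance(entry, dict) and entry.get("internal_only")]
    print(hidden)  # fields switched from expected_impact to internal_only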
- expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8868,7 +9026,7 @@ XLM: start_n_top: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null unk_index: ui_display_name: null @@ -8876,6 +9034,7 @@ XLM: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -8911,7 +9070,7 @@ XLMRoBERTa: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pad_token_id: ui_display_name: null @@ -8919,8 +9078,10 @@ XLMRoBERTa: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -8928,7 +9089,7 @@ XLMRoBERTa: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8939,10 +9100,11 @@ XLMRoBERTa: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -9010,6 +9172,7 @@ XLNet: ui_display_name: End-of-Sequence Token Id ff_activation: ui_display_name: null + expected_impact: 1 initializer_range: description_implications: There is an ideal value for this variable that doesn't @@ -9031,7 +9194,7 @@ XLNet: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null mem_len: ui_display_name: null @@ -9045,8 +9208,10 @@ XLNet: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 reuse_len: ui_display_name: null same_length: @@ -9058,7 +9223,7 @@ XLNet: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -9073,6 +9238,7 @@ XLNet: summary_activation: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: Summary Activation Function + expected_impact: 1 summary_last_dropout: default_value_reasoning: Huggingface default. 
description_implications: @@ -9101,7 +9267,7 @@ XLNet: summary_use_proj: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null untie_r: ui_display_name: null @@ -9111,6 +9277,7 @@ XLNet: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to diff --git a/ludwig/schema/metadata/configs/features.yaml b/ludwig/schema/metadata/configs/features.yaml index db686e09ce4..9674fb97bb2 100644 --- a/ludwig/schema/metadata/configs/features.yaml +++ b/ludwig/schema/metadata/configs/features.yaml @@ -570,7 +570,7 @@ set: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -585,6 +585,7 @@ set: ui_display_name: Most common (vocabulary size) tokenizer: ui_display_name: null + expected_impact: 3 text: preprocessing: computed_fill_value: From 94b34ad3011f2ecdaf8ee7d92e09477dafaee6c0 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 23:27:54 -0700 Subject: [PATCH 07/22] Combiners and Decoders --- ludwig/schema/metadata/configs/combiners.yaml | 64 ++++++++++++++----- ludwig/schema/metadata/configs/decoders.yaml | 29 ++++++--- ludwig/schema/metadata/configs/encoders.yaml | 9 +++ 3 files changed, 77 insertions(+), 25 deletions(-) diff --git a/ludwig/schema/metadata/configs/combiners.yaml b/ludwig/schema/metadata/configs/combiners.yaml index fc5ffaf86a4..4f9f6307f3c 100644 --- a/ludwig/schema/metadata/configs/combiners.yaml +++ b/ludwig/schema/metadata/configs/combiners.yaml @@ -20,7 +20,7 @@ ComparatorCombiner: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -80,10 +80,12 @@ ComparatorCombiner: literature_references: - https://ludwig.ai/0.6/configuration/combiner/#comparator-combiner ui_display_name: Entity 1 + expected_impact: 3 entity_2: literature_references: - https://ludwig.ai/0.6/configuration/combiner/#comparator-combiner ui_display_name: Entity 2 + expected_impact: 3 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -145,6 +147,7 @@ ComparatorCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -181,7 +184,7 @@ ComparatorCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -235,7 +238,7 @@ ComparatorCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. 
- expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -269,7 +272,7 @@ ConcatCombiner: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -361,6 +364,7 @@ ConcatCombiner: ui_display_name: Fully Connected Layers flatten_inputs: ui_display_name: null + expected_impact: 1 norm: default_value_reasoning: While batch normalization and layer normalization @@ -388,6 +392,7 @@ ConcatCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -424,7 +429,7 @@ ConcatCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -440,6 +445,7 @@ ConcatCombiner: ui_display_name: Output Size residual: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -480,7 +486,7 @@ ConcatCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -508,7 +514,7 @@ ProjectAggregateCombiner: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -625,6 +631,7 @@ ProjectAggregateCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -661,7 +668,7 @@ ProjectAggregateCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -677,8 +684,10 @@ ProjectAggregateCombiner: ui_display_name: Output Size projection_size: ui_display_name: null + expected_impact: 1 residual: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -719,7 +728,7 @@ ProjectAggregateCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -745,10 +754,13 @@ SequenceCombiner: the outputs for the sequence encoders also apply to the sequence combiner. encoder: ui_display_name: null + expected_impact: 3 main_sequence_feature: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 SequenceConcatCombiner: type: short_description: @@ -774,8 +786,10 @@ SequenceConcatCombiner: the concatenation of the h dimensions of all input features. main_sequence_feature: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 TabNetCombiner: type: short_description: @@ -831,6 +845,7 @@ TabNetCombiner: where x_hat is the estimated statistic and x_t is the new observed value." suggested_values: 0.01-0.2 ui_display_name: Batch Norm Momentum + expected_impact: 1 bn_virtual_bs: default_value_reasoning: Paper default. description_implications: @@ -846,7 +861,7 @@ TabNetCombiner: of data, so the authors use it only in the generator network. A higher virtual batch size could improve normalization, but it also causes training to run slower since each batch will be sampled multiple times. - expected_impact: 2 + expected_impact: 1 literature_references: - https://paperswithcode.com/method/virtual-batch-normalization ui_display_name: "Ghost Normalization: Virtual batch size" @@ -873,14 +888,19 @@ TabNetCombiner: ui_display_name: Dropout entmax_alpha: ui_display_name: null + expected_impact: 1 entmax_mode: ui_display_name: null + expected_impact: 1 num_shared_blocks: ui_display_name: null + expected_impact: 1 num_steps: ui_display_name: null + expected_impact: 1 num_total_blocks: ui_display_name: null + expected_impact: 1 output_size: default_value_reasoning: A modest value, not too small, not too large. description_implications: @@ -890,7 +910,7 @@ TabNetCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -906,10 +926,13 @@ TabNetCombiner: ui_display_name: Output Size relaxation_factor: ui_display_name: null + expected_impact: 1 size: ui_display_name: null + expected_impact: 3 sparsity: ui_display_name: null + expected_impact: 1 TabTransformerCombiner: type: short_description: @@ -1005,6 +1028,7 @@ TabTransformerCombiner: related_parameters: - hidden_size ui_display_name: Embed Input Feature Name + expected_impact: 3 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -1040,7 +1064,7 @@ TabTransformerCombiner: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html suggested_values: 0.05 - 0.8 @@ -1077,6 +1101,7 @@ TabTransformerCombiner: ui_display_name: Fully Connected Layers fc_residual: ui_display_name: null + expected_impact: 1 hidden_size: default_value_reasoning: Not too big, not too small. description_implications: @@ -1118,6 +1143,7 @@ TabTransformerCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -1193,7 +1219,7 @@ TabTransformerCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1209,6 +1235,7 @@ TabTransformerCombiner: ui_display_name: Output Size reduce_output: ui_display_name: null + expected_impact: 1 transformer_output_size: default_value_reasoning: A modest value, not too small, not too large. description_implications: @@ -1272,7 +1299,7 @@ TabTransformerCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -1384,7 +1411,7 @@ TransformerCombiner: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html suggested_values: 0.05 - 0.8 @@ -1462,6 +1489,7 @@ TransformerCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -1491,6 +1519,7 @@ TransformerCombiner: ui_display_name: Number of Fully Connected Layers num_heads: ui_display_name: null + expected_impact: 1 num_layers: default_value_reasoning: The ideal number of layers depends on the data. 
For @@ -1521,7 +1550,7 @@ TransformerCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1537,6 +1566,7 @@ TransformerCombiner: ui_display_name: Output Size reduce_output: ui_display_name: null + expected_impact: 1 transformer_output_size: default_value_reasoning: A modest value, not too small, not too large. description_implications: @@ -1600,7 +1630,7 @@ TransformerCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" diff --git a/ludwig/schema/metadata/configs/decoders.yaml b/ludwig/schema/metadata/configs/decoders.yaml index e17b0effc9a..f2a70b9a68b 100644 --- a/ludwig/schema/metadata/configs/decoders.yaml +++ b/ludwig/schema/metadata/configs/decoders.yaml @@ -34,12 +34,14 @@ Classifier: ui_display_name: Bias Initializer input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed num_classes: other_information: Internal Only ui_display_name: Not Displayed + expected_impact: 3 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -80,7 +82,7 @@ Classifier: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -105,7 +107,7 @@ Projector: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -139,8 +141,10 @@ Projector: ui_display_name: Bias Initializer clip: ui_display_name: null + expected_impact: 1 input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed @@ -153,7 +157,7 @@ Projector: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -207,7 +211,7 @@ Projector: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -228,6 +232,7 @@ Regressor: projection to a single number. activation: ui_display_name: null + expected_impact: 2 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -257,6 +262,7 @@ Regressor: ui_display_name: Bias Initializer input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed @@ -300,7 +306,7 @@ Regressor: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -320,6 +326,7 @@ PassthroughDecoder: The passthrough decoder simply returns the raw output coming from the combiner. input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed @@ -344,13 +351,15 @@ SequenceGeneratorDecoder: during model building. cell_type: ui_display_name: null + expected_impact: 3 input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed max_sequence_length: - expected_impact: 1 + expected_impact: 3 ui_display_name: null num_layers: default_value_reasoning: @@ -361,7 +370,7 @@ SequenceGeneratorDecoder: performance for longer sequences or more complex tasks. 
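For a sense of how the decoder parameters graded above surface to users: they are set on an output feature's decoder block in a Ludwig config. A hedged sketch follows — the feature names are invented and the nesting assumes the 0.6-style config this schema describes:

    # Hypothetical config exercising the generator decoder parameters whose
    # metadata is re-graded above (cell_type, num_layers). Names are made up.
    config = {
        "input_features": [
            {"name": "document", "type": "text", "encoder": {"type": "rnn"}},
        ],
        "output_features": [
            {
                "name": "summary",
                "type": "text",
                "decoder": {"type": "generator", "cell_type": "lstm", "num_layers": 2},
            },
        ],
    }

Passing a dict like this to ludwig.api.LudwigModel should build a text-to-text model whose generator decoder stacks two LSTM layers, which is the knob the description above discusses.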
example_value: - 1 - expected_impact: 1 + expected_impact: 3 suggested_values: 1-3 suggested_values_reasoning: Increasing the number of layers may improve encoder @@ -410,16 +419,19 @@ SequenceTaggerDecoder: ui_display_name: Attention Embedding Size attention_num_heads: ui_display_name: null + expected_impact: 1 input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed max_sequence_length: - expected_impact: 1 + expected_impact: 3 ui_display_name: null use_attention: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -448,3 +460,4 @@ SequenceTaggerDecoder: ui_display_name: Use Bias vocab_size: ui_display_name: Not displayed + internal_only: true diff --git a/ludwig/schema/metadata/configs/encoders.yaml b/ludwig/schema/metadata/configs/encoders.yaml index f66e61739e3..a3072c18d46 100644 --- a/ludwig/schema/metadata/configs/encoders.yaml +++ b/ludwig/schema/metadata/configs/encoders.yaml @@ -394,6 +394,7 @@ BERT: ui_display_name: classifier_dropout gradient_checkpointing: ui_display_name: null + expected_impact: 1 hidden_act: default_value_reasoning: Taken from huggingface. description_implications: @@ -462,8 +463,10 @@ BERT: ui_display_name: null intermediate_size: ui_display_name: null + expected_impact: 1 layer_norm_eps: ui_display_name: null + expected_impact: 1 max_position_embeddings: default_value_reasoning: Taken from huggingface. description_implications: @@ -491,14 +494,19 @@ BERT: ui_display_name: null num_attention_heads: ui_display_name: null + expected_impact: 1 num_hidden_layers: ui_display_name: null + expected_impact: 1 pad_token_id: ui_display_name: null + expected_impact: 1 position_embedding_type: ui_display_name: null + expected_impact: 1 pretrained_kwargs: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null expected_impact: 3 @@ -527,6 +535,7 @@ BERT: ui_display_name: null type_vocab_size: ui_display_name: null + expected_impact: 1 use_pretrained: ui_display_name: null expected_impact: 2 From ff40d63c8046e78ed8888c64b53df9431ade3661 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 00:09:13 -0700 Subject: [PATCH 08/22] Fix parameter metadata not showing up for some params --- ludwig/schema/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ludwig/schema/utils.py b/ludwig/schema/utils.py index d95cf096527..7bd66b7cec5 100644 --- a/ludwig/schema/utils.py +++ b/ludwig/schema/utils.py @@ -254,9 +254,12 @@ def String( allow_none=allow_none, load_default=default, dump_default=default, - metadata={"description": description}, + metadata={ + "description": description, + "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None + }, ), - "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, + # "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, }, default=default, ) From 195624d896b8ea8d3604cea3889afc18a0efe6f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 07:10:07 +0000 Subject: [PATCH 09/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/schema/utils.py 
b/ludwig/schema/utils.py index 7bd66b7cec5..17238e70ea5 100644 --- a/ludwig/schema/utils.py +++ b/ludwig/schema/utils.py @@ -256,7 +256,7 @@ def String( dump_default=default, metadata={ "description": description, - "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None + "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, }, ), # "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, From 1ea2a91e778b64ce5003ef0f468b60992201789a Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 08:35:03 -0700 Subject: [PATCH 10/22] fix --- ludwig/schema/encoders/sequence_encoders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ludwig/schema/encoders/sequence_encoders.py b/ludwig/schema/encoders/sequence_encoders.py index c244a4bb59e..ab2ae41ecba 100644 --- a/ludwig/schema/encoders/sequence_encoders.py +++ b/ludwig/schema/encoders/sequence_encoders.py @@ -168,7 +168,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=1, + default=None, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["ParallelCNN"]["num_conv_layers"], ) @@ -336,7 +336,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=1, + default=None, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["StackedCNN"]["num_conv_layers"], ) @@ -1063,7 +1063,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=1, + default=None, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["StackedCNNRNN"]["num_conv_layers"], ) From 4c7f2e11a37cc1c5a54fb8bd5d94af26d4777fdc Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 08:57:28 -0700 Subject: [PATCH 11/22] Fix some missing param metadata --- ludwig/schema/decoders/base.py | 1 + ludwig/schema/encoders/base.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 9c16ceee4f5..cc2bcf6f751 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -75,6 +75,7 @@ def module_name(cls): input_size: int = schema_utils.PositiveInteger( default=1, description="Size of the input to the decoder.", + parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["input_size"], ) diff --git a/ludwig/schema/encoders/base.py b/ludwig/schema/encoders/base.py index 92595db65bc..1ca3dac0d58 100644 --- a/ludwig/schema/encoders/base.py +++ b/ludwig/schema/encoders/base.py @@ -54,36 +54,43 @@ def module_name(): min=0, max=1, description="Dropout rate.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["dropout"], ) activation: str = schema_utils.StringOptions( ["elu", "leakyRelu", "logSigmoid", "relu", "sigmoid", "tanh", "softmax"], default="relu", description="Activation function to apply to the output.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["activation"], ) input_size: int = schema_utils.PositiveInteger( default=None, description="Size of the input to the dense encoder.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["input_size"], ) output_size: int = schema_utils.PositiveInteger( default=256, description="Size of the output of the feature.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["output_size"], ) use_bias: bool = schema_utils.Boolean( 
default=True, description="Whether the layer uses a bias vector.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["use_bias"], ) bias_initializer: Union[str, dict] = schema_utils.InitializerOptions( default="zeros", description="Initializer for the bias vector.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["bias_initializer"], ) weights_initializer: Union[str, dict] = schema_utils.InitializerOptions( description="Initializer for the weight matrix.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["weights_initializer"], ) norm: str = schema_utils.StringOptions( @@ -91,19 +98,23 @@ def module_name(): allow_none=True, default=None, description="Normalization to use in the dense layer.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["norm"], ) norm_params: dict = schema_utils.Dict( default=None, description="Parameters for normalization if norm is either batch or layer.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["norm_params"], ) num_layers: int = schema_utils.PositiveInteger( default=1, description="Number of stacked fully connected layers that the input to the feature passes through.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["num_layers"], ) fc_layers: List[dict] = schema_utils.DictList( default=None, description="List of fully connected layers to use in the encoder.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["fc_layers"], ) From f0b294ba75ad92675f7dd5f233d13c357f804645 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 09:02:41 -0700 Subject: [PATCH 12/22] fix --- ludwig/schema/metadata/configs/encoders.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ludwig/schema/metadata/configs/encoders.yaml b/ludwig/schema/metadata/configs/encoders.yaml index a3072c18d46..0a6b587cb2b 100644 --- a/ludwig/schema/metadata/configs/encoders.yaml +++ b/ludwig/schema/metadata/configs/encoders.yaml @@ -2125,8 +2125,9 @@ DenseEncoder: related_parameters: - "No" ui_display_name: Not Displayed - layers: + fc_layers: ui_display_name: null + expected_impact: 1 norm: default_value_reasoning: While batch normalization and layer normalization @@ -2234,6 +2235,7 @@ DenseEncoder: ui_display_name: Output Size use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). 
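Combined with the utils.py change in PATCH 08, additions like these mean a schema field now carries its parameter metadata inside the marshmallow field's metadata dict, next to the description. A rough way to confirm that for the dense encoder — the class name is assumed from the module being edited, so treat this as a sketch rather than an official API:

    # Sketch: report which DenseEncoderConfig fields expose parameter_metadata
    # through their marshmallow field metadata. Class name assumed from context.
    import dataclasses
    from ludwig.schema.encoders.base import DenseEncoderConfig

    for f in dataclasses.fields(DenseEncoderConfig):
        mm = f.metadata.get("marshmallow_field")
        if mm is not None:
            print(f.name, "parameter_metadata" in mm.metadata)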
description_implications: From 156a2f21873880bfb44a0d5ea56c221e518a9017 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 09:38:56 -0700 Subject: [PATCH 13/22] Combiner metadata fix --- ludwig/schema/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ludwig/schema/utils.py b/ludwig/schema/utils.py index 17238e70ea5..cf4fcebe5f2 100644 --- a/ludwig/schema/utils.py +++ b/ludwig/schema/utils.py @@ -801,6 +801,7 @@ def _deserialize(self, value, attr, data, **kwargs): def _jsonschema_type_mapping(self): initializers = list(initializer_registry.keys()) + param_metadata = convert_metadata_to_json(parameter_metadata) if parameter_metadata else None return { "oneOf": [ # Note: default not provided in the custom dict option: @@ -813,6 +814,7 @@ def _jsonschema_type_mapping(self): "title": f"{self.name}_custom_option", "additionalProperties": True, "description": "Customize an existing initializer.", + "parameter_metadata": param_metadata, }, { "type": "string", @@ -820,6 +822,7 @@ def _jsonschema_type_mapping(self): "default": default, "title": f"{self.name}_preconfigured_option", "description": "Pick a preconfigured initializer.", + "parameter_metadata": param_metadata, }, ], "title": self.name, @@ -830,7 +833,13 @@ def _jsonschema_type_mapping(self): return field( metadata={ "marshmallow_field": InitializerOptionsOrCustomDictField( - allow_none=False, load_default=default, dump_default=default, metadata={"description": description} + allow_none=False, + load_default=default, + dump_default=default, + metadata={ + "description": description, + "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, + }, ) }, default=default, From d507aee1263139f4d3ba93fd3cab892681fd41a7 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 09:56:36 -0700 Subject: [PATCH 14/22] fix decoders --- ludwig/schema/decoders/base.py | 50 +++++++++++++++----- ludwig/schema/metadata/configs/decoders.yaml | 29 ++++++++++++ 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index cc2bcf6f751..cd6631a0b39 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -15,44 +15,71 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): """Base class for decoders.""" - type: str + type: str = schema_utils.StringOptions( + [], + description="The type of decoder to use.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] + ) fc_layers: List[Dict[str, Any]] = schema_utils.DictList( - default=None, description="List of dictionaries containing the parameters for each fully connected layer." + default=None, + description="List of dictionaries containing the parameters for each fully connected layer.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_layers"] ) num_fc_layers: int = schema_utils.NonNegativeInteger( - default=0, description="Number of fully-connected layers if fc_layers not specified." 
+ default=0, + description="Number of fully-connected layers if fc_layers not specified.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["num_fc_layers"] ) - fc_output_size: int = schema_utils.PositiveInteger(default=256, description="Output size of fully connected stack.") + fc_output_size: int = schema_utils.PositiveInteger( + default=256, + description="Output size of fully connected stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_output_size"] + ) fc_use_bias: bool = schema_utils.Boolean( - default=True, description="Whether the layer uses a bias vector in the fc_stack." + default=True, + description="Whether the layer uses a bias vector in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"] ) fc_weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( - default="xavier_uniform", description="The weights initializer to use for the layers in the fc_stack" + default="xavier_uniform", + description="The weights initializer to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] ) fc_bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( - default="zeros", description="The bias initializer to use for the layers in the fc_stack" + default="zeros", + description="The bias initializer to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] ) fc_norm: str = schema_utils.StringOptions( - ["batch", "layer"], description="The normalization to use for the layers in the fc_stack" + ["batch", "layer"], + description="The normalization to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm"] ) fc_norm_params: dict = schema_utils.Dict( - description="The additional parameters for the normalization in the fc_stack" + description="The additional parameters for the normalization in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm_params"] ) fc_activation: str = schema_utils.ActivationOptions( - default="relu", description="The activation to use for the layers in the fc_stack" + default="relu", + description="The activation to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_activation"] ) fc_dropout: float = schema_utils.FloatRange( - default=0.0, min=0, max=1, description="The dropout rate to use for the layers in the fc_stack" + default=0.0, + min=0, + max=1, + description="The dropout rate to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_dropout"] ) @@ -70,6 +97,7 @@ def module_name(cls): "passthrough", description="The passthrough decoder simply returns the raw numerical values coming from the combiner as " "outputs", + parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["type"] ) input_size: int = schema_utils.PositiveInteger( diff --git a/ludwig/schema/metadata/configs/decoders.yaml b/ludwig/schema/metadata/configs/decoders.yaml index f2a70b9a68b..14de6f92606 100644 --- a/ludwig/schema/metadata/configs/decoders.yaml +++ b/ludwig/schema/metadata/configs/decoders.yaml @@ -1,3 +1,26 @@ +BaseDecoder: + type: + expected_impact: 1 + fc_layers: + expected_impact: 1 + num_fc_layers: + expected_impact: 3 + fc_output_size: + expected_impact: 3 + fc_use_bias: + expected_impact: 1 + fc_weights_initializer: + expected_impact: 1 + fc_bias_initializer: + expected_impact: 1 + fc_norm: + expected_impact: 2 + fc_norm_params: + 
expected_impact: 1 + fc_activation: + expected_impact: 2 + fc_dropout: + expected_impact: 3 Classifier: type: short_description: @@ -5,6 +28,7 @@ Classifier: long_description: The classifier decoder is a (potentially empty) stack of fully connected layers, followed by a projection into a vector of size of the number of available classes, followed by a sigmoid. + expected_impact: 0 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -102,6 +126,7 @@ Projector: The Projector decoder is a (potentially empty) stack of fully connected layers, followed by a projection into a tensor of the vector size (optionally followed by a softmax in the case of multi-class classification). + expected_impact: 0 activation: description_implications: Changing the activation functions has an impact @@ -230,6 +255,7 @@ Regressor: long_description: The regressor decoder is a (potentially empty) stack of fully connected layers, followed by a projection to a single number. + expected_impact: 0 activation: ui_display_name: null expected_impact: 2 @@ -324,6 +350,7 @@ PassthroughDecoder: Provides the raw input from the combiner. long_description: The passthrough decoder simply returns the raw output coming from the combiner. + expected_impact: 0 input_size: other_information: Internal Only internal_only: true @@ -349,6 +376,7 @@ SequenceGeneratorDecoder: feature without reduced outputs or the output of a sequence-based combiner. If a `b x h` input is provided to a generator decoder using an RNN with attention instead, an error will be raised during model building. + expected_impact: 0 cell_type: ui_display_name: null expected_impact: 3 @@ -405,6 +433,7 @@ SequenceTaggerDecoder: a hidden dimension, which is the output of a sequence, text or time series input feature without reduced outputs or the output of a sequence-based combiner. If a `b x h` input is provided instead, an error will be raised during model building. + expected_impact: 0 attention_embedding_size: default_value_reasoning: Not too big, not too small. 
description_implications: From 4f2d6e1bd3dc653f2dfa7fdc214804113797c7c0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 16:57:33 +0000 Subject: [PATCH 15/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/decoders/base.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index cd6631a0b39..10228ecfd58 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -16,62 +16,60 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): """Base class for decoders.""" type: str = schema_utils.StringOptions( - [], - description="The type of decoder to use.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] + [], description="The type of decoder to use.", parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] ) fc_layers: List[Dict[str, Any]] = schema_utils.DictList( default=None, description="List of dictionaries containing the parameters for each fully connected layer.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_layers"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_layers"], ) num_fc_layers: int = schema_utils.NonNegativeInteger( default=0, description="Number of fully-connected layers if fc_layers not specified.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["num_fc_layers"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["num_fc_layers"], ) fc_output_size: int = schema_utils.PositiveInteger( default=256, description="Output size of fully connected stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_output_size"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_output_size"], ) fc_use_bias: bool = schema_utils.Boolean( default=True, description="Whether the layer uses a bias vector in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"], ) fc_weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( default="xavier_uniform", description="The weights initializer to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ) fc_bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( default="zeros", description="The bias initializer to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ) fc_norm: str = schema_utils.StringOptions( ["batch", "layer"], description="The normalization to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm"], ) fc_norm_params: dict = schema_utils.Dict( description="The additional parameters for the normalization in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm_params"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm_params"], ) fc_activation: str = schema_utils.ActivationOptions( default="relu", description="The activation to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_activation"] + 
parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_activation"], ) fc_dropout: float = schema_utils.FloatRange( @@ -79,7 +77,7 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): min=0, max=1, description="The dropout rate to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_dropout"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_dropout"], ) @@ -97,7 +95,7 @@ def module_name(cls): "passthrough", description="The passthrough decoder simply returns the raw numerical values coming from the combiner as " "outputs", - parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["type"] + parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["type"], ) input_size: int = schema_utils.PositiveInteger( From 1c26d29a7d820a03ddff011a4e7ffd5164de311f Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 10:01:45 -0700 Subject: [PATCH 16/22] fix --- ludwig/schema/decoders/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index cd6631a0b39..10342646cdd 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -16,7 +16,7 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): """Base class for decoders.""" type: str = schema_utils.StringOptions( - [], + ["regressor", "classifier", "projector", "generator", "tagger"], description="The type of decoder to use.", parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] ) From 71bef9447ab950cf9ff31169cb5eda179700482c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 17:04:29 +0000 Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/decoders/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 5eaef9d0139..46ccbfc0fbf 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -18,7 +18,7 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): type: str = schema_utils.StringOptions( ["regressor", "classifier", "projector", "generator", "tagger"], description="The type of decoder to use.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"], ) fc_layers: List[Dict[str, Any]] = schema_utils.DictList( From 3d2c68ed94ca2b5a4950127a3f7924f36b37d8e4 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 10:57:51 -0700 Subject: [PATCH 18/22] Initializer fix --- ludwig/schema/decoders/base.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 5eaef9d0139..34381159104 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -45,15 +45,35 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"], ) - fc_weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( + fc_weights_initializer: Union[str, Dict] = schema_utils.OneOfOptionsField( default="xavier_uniform", description="The weights initializer to use for the layers in the fc_stack", + field_options=[ + schema_utils.InitializerOptions( + description="Preconfigured initializer to use for the layers in 
the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + ), + schema_utils.Dict( + description="Custom initializer to use for the layers in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + ), + ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ) - fc_bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( + fc_bias_initializer: Union[str, Dict] = schema_utils.OneOfOptionsField( default="zeros", description="The bias initializer to use for the layers in the fc_stack", + field_options=[ + schema_utils.InitializerOptions( + description="Preconfigured bias initializer to use for the layers in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + ), + schema_utils.Dict( + description="Custom bias initializer to use for the layers in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + ), + ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ) From 3e2d32be03fc9cb53807c983726cf6707b1af717 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 17:59:06 +0000 Subject: [PATCH 19/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/decoders/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 851617281ba..5962c6d0fa6 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -51,11 +51,11 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): field_options=[ schema_utils.InitializerOptions( description="Preconfigured initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ), schema_utils.Dict( description="Custom initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ), ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], @@ -67,11 +67,11 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): field_options=[ schema_utils.InitializerOptions( description="Preconfigured bias initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ), schema_utils.Dict( description="Custom bias initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ), ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], From 9e80f4bea744027f563a7b898d1d551af271a760 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 11:21:47 -0700 Subject: [PATCH 20/22] Loss param metadata --- ludwig/schema/features/loss/loss.py | 64 +++++++++++++++++++++--- ludwig/schema/metadata/__init__.py | 1 + ludwig/schema/metadata/configs/loss.yaml | 54 ++++++++++++++++++++ 3 files changed, 111 insertions(+), 8 deletions(-) create mode 100644 
ludwig/schema/metadata/configs/loss.yaml diff --git a/ludwig/schema/features/loss/loss.py b/ludwig/schema/features/loss/loss.py index 1244857fe6b..519f5eb38cf 100644 --- a/ludwig/schema/features/loss/loss.py +++ b/ludwig/schema/features/loss/loss.py @@ -14,6 +14,7 @@ SOFTMAX_CROSS_ENTROPY, ) from ludwig.schema import utils as schema_utils +from ludwig.schema.metadata import LOSS_METADATA @DeveloperAPI @@ -37,6 +38,7 @@ class MSELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["MSELoss"]["weight"], ) @@ -51,6 +53,7 @@ class MAELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["MAELoss"]["weight"], ) @@ -65,6 +68,7 @@ class RMSELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["RMSELoss"]["weight"], ) @@ -79,6 +83,7 @@ class RMSPELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["RMSPELoss"]["weight"], ) @@ -93,15 +98,25 @@ class BWCEWLossConfig(BaseLossConfig): positive_class_weight: int = schema_utils.NonNegativeInteger( default=None, description="Weight of the positive class.", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["positive_class_weight"], ) - robust_lambda: int = schema_utils.NonNegativeInteger(default=0, description="") + robust_lambda: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["robust_lambda"], + ) - confidence_penalty: float = schema_utils.NonNegativeFloat(default=0, description="") + confidence_penalty: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["confidence_penalty"], + ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["weight"], ) @@ -117,24 +132,39 @@ class SoftmaxCrossEntropyLossConfig(BaseLossConfig): list_type=float, default=None, description="Weights to apply to each class in the loss. If not specified, all classes are weighted equally.", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["class_weights"], ) - robust_lambda: int = schema_utils.NonNegativeInteger(default=0, description="") + robust_lambda: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["robust_lambda"], + ) - confidence_penalty: float = schema_utils.NonNegativeFloat(default=0, description="") + confidence_penalty: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["confidence_penalty"], + ) class_similarities: list = schema_utils.List( list, default=None, description="If not null this parameter is a c x c matrix in the form of a list of lists that contains the " "mutual similarity of classes. It is used if `class_similarities_temperature` is greater than 0. 
", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["class_similarities"], ) - class_similarities_temperature: int = schema_utils.NonNegativeInteger(default=0, description="") + class_similarities_temperature: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["class_similarities_temperature"], + ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["weight"], ) @@ -150,29 +180,45 @@ class SequenceSoftmaxCrossEntropyLossConfig(BaseLossConfig): list_type=float, default=None, description="Weights to apply to each class in the loss. If not specified, all classes are weighted equally.", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["class_weights"], ) - robust_lambda: int = schema_utils.NonNegativeInteger(default=0, description="") + robust_lambda: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["robust_lambda"], + ) - confidence_penalty: float = schema_utils.NonNegativeFloat(default=0, description="") + confidence_penalty: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["confidence_penalty"], + ) class_similarities: list = schema_utils.List( list, default=None, description="If not null this parameter is a c x c matrix in the form of a list of lists that contains the " "mutual similarity of classes. It is used if `class_similarities_temperature` is greater than 0. ", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["class_similarities"], ) - class_similarities_temperature: int = schema_utils.NonNegativeInteger(default=0, description="") + class_similarities_temperature: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["class_similarities_temperature"], + ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["weight"], ) unique: bool = schema_utils.Boolean( default=False, description="If true, the loss is only computed for unique elements in the sequence.", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["unique"], ) @@ -188,9 +234,11 @@ class SigmoidCrossEntropyLossConfig(BaseLossConfig): list_type=float, default=None, description="Weights to apply to each class in the loss. 
If not specified, all classes are weighted equally.", + parameter_metadata=LOSS_METADATA["SigmoidCrossEntropyLoss"]["class_weights"], ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["SigmoidCrossEntropyLoss"]["weight"], ) diff --git a/ludwig/schema/metadata/__init__.py b/ludwig/schema/metadata/__init__.py index fbeb96ed964..367a2ba5097 100644 --- a/ludwig/schema/metadata/__init__.py +++ b/ludwig/schema/metadata/__init__.py @@ -34,3 +34,4 @@ def _load(fname: str) -> Dict[str, Any]: PREPROCESSING_METADATA = _load("preprocessing.yaml") TRAINER_METADATA = _load("trainer.yaml") OPTIMIZER_METADATA = _load("optimizers.yaml") +LOSS_METADATA = _load("loss.yaml") diff --git a/ludwig/schema/metadata/configs/loss.yaml b/ludwig/schema/metadata/configs/loss.yaml new file mode 100644 index 00000000000..128cd055843 --- /dev/null +++ b/ludwig/schema/metadata/configs/loss.yaml @@ -0,0 +1,54 @@ +MSELoss: + weight: + expected_impact: 3 +MAELoss: + weight: + expected_impact: 3 +RMSELoss: + weight: + expected_impact: 3 +RMSPELoss: + weight: + expected_impact: 3 +BWCEWLoss: + positive_class_weight: + expected_impact: 3 + robust_lambda: + expected_impact: 2 + confidence_penalty: + expected_impact: 2 + weight: + expected_impact: 3 +SoftmaxCrossEntropyLoss: + class_weights: + expected_impact: 3 + robust_lambda: + expected_impact: 2 + confidence_penalty: + expected_impact: 2 + class_similarities: + expected_impact: 2 + class_similarities_temperature: + expected_impact: 2 + weight: + expected_impact: 3 +SequenceSoftmaxCrossEntropyLoss: + class_weights: + expected_impact: 3 + robust_lambda: + expected_impact: 2 + confidence_penalty: + expected_impact: 2 + class_similarities: + expected_impact: 2 + class_similarities_temperature: + expected_impact: 2 + weight: + expected_impact: 3 + unique: + expected_impact: 2 +SigmoidCrossEntropyLoss: + class_weights: + expected_impact: 3 + weight: + expected_impact: 3 From 90bd37d590069c32122f8b4e8401b703fe4af6be Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 12:35:04 -0700 Subject: [PATCH 21/22] Output Feature params --- ludwig/schema/features/binary_feature.py | 6 ++ ludwig/schema/features/category_feature.py | 6 ++ ludwig/schema/features/number_feature.py | 5 ++ ludwig/schema/features/sequence_feature.py | 4 ++ ludwig/schema/features/set_feature.py | 5 ++ ludwig/schema/features/text_feature.py | 5 ++ ludwig/schema/features/vector_feature.py | 6 ++ ludwig/schema/metadata/configs/features.yaml | 60 ++++++++++++++++++++ 8 files changed, 97 insertions(+) diff --git a/ludwig/schema/features/binary_feature.py b/ludwig/schema/features/binary_feature.py index c3e922ec361..423d85588b3 100644 --- a/ludwig/schema/features/binary_feature.py +++ b/ludwig/schema/features/binary_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -73,6 +74,7 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf calibration: bool = schema_utils.Boolean( default=False, description="Calibrate the model's output probabilities using temperature scaling.", + parameter_metadata=FEATURE_METADATA[BINARY]["calibration"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -85,6 +87,7 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf dependencies: list = 
schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[BINARY]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="binary_output") @@ -92,12 +95,14 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[BINARY]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[BINARY]["reduce_input"], ) threshold: float = schema_utils.FloatRange( @@ -106,4 +111,5 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf max=1, description="The threshold used to convert output probabilities to predictions. Predicted probabilities greater" "than or equal to threshold are mapped to True.", + parameter_metadata=FEATURE_METADATA[BINARY]["threshold"], ) diff --git a/ludwig/schema/features/category_feature.py b/ludwig/schema/features/category_feature.py index a3ab732d184..4fbb23ea308 100644 --- a/ludwig/schema/features/category_feature.py +++ b/ludwig/schema/features/category_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -75,6 +76,7 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature calibration: bool = schema_utils.Boolean( default=False, description="Calibrate the model's output probabilities using temperature scaling.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["calibration"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -87,6 +89,7 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="category_output") @@ -94,12 +97,14 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[CATEGORY]["reduce_input"], ) top_k: int = schema_utils.NonNegativeInteger( @@ -107,4 +112,5 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature description="Determines the parameter k, the number of categories to consider when computing the top_k " "measure. 
It computes accuracy but considering as a match if the true category appears in the " "first k predicted categories ranked by decoder's confidence.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["top_k"], ) diff --git a/ludwig/schema/features/number_feature.py b/ludwig/schema/features/number_feature.py index 604583b85dc..2bd51ab4689 100644 --- a/ludwig/schema/features/number_feature.py +++ b/ludwig/schema/features/number_feature.py @@ -21,6 +21,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -80,6 +81,7 @@ class NumberOutputFeatureConfig(BaseOutputFeatureConfig, NumberOutputFeatureConf min=0, max=999999999, description="Clip the predicted output to the specified range.", + parameter_metadata=FEATURE_METADATA[NUMBER]["clip"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -92,17 +94,20 @@ class NumberOutputFeatureConfig(BaseOutputFeatureConfig, NumberOutputFeatureConf dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[NUMBER]["dependencies"], ) reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[NUMBER]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[NUMBER]["reduce_input"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="number_output") diff --git a/ludwig/schema/features/sequence_feature.py b/ludwig/schema/features/sequence_feature.py index 787d34d5293..7c5b00b31a6 100644 --- a/ludwig/schema/features/sequence_feature.py +++ b/ludwig/schema/features/sequence_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -82,6 +83,7 @@ class SequenceOutputFeatureConfig(BaseOutputFeatureConfig, SequenceOutputFeature dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[SEQUENCE]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="sequence_output") @@ -89,10 +91,12 @@ class SequenceOutputFeatureConfig(BaseOutputFeatureConfig, SequenceOutputFeature reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[SEQUENCE]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[SEQUENCE]["reduce_input"], ) diff --git a/ludwig/schema/features/set_feature.py b/ludwig/schema/features/set_feature.py index 5c2ea6faf7f..da0faf8e27f 100644 --- a/ludwig/schema/features/set_feature.py +++ b/ludwig/schema/features/set_feature.py @@ -19,6 +19,7 @@ 
output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -80,6 +81,7 @@ class SetOutputFeatureConfig(BaseOutputFeatureConfig, SetOutputFeatureConfigMixi dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[SET]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="set_output") @@ -87,12 +89,14 @@ class SetOutputFeatureConfig(BaseOutputFeatureConfig, SetOutputFeatureConfigMixi reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[SET]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[SET]["reduce_input"], ) threshold: float = schema_utils.FloatRange( @@ -101,4 +105,5 @@ class SetOutputFeatureConfig(BaseOutputFeatureConfig, SetOutputFeatureConfigMixi max=1, description="The threshold used to convert output probabilities to predictions. Tokens with predicted" "probabilities greater than or equal to threshold are predicted to be in the output set (True).", + parameter_metadata=FEATURE_METADATA[SET]["threshold"], ) diff --git a/ludwig/schema/features/text_feature.py b/ludwig/schema/features/text_feature.py index 62d1ae39858..a1428b37cb8 100644 --- a/ludwig/schema/features/text_feature.py +++ b/ludwig/schema/features/text_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -75,6 +76,7 @@ class TextOutputFeatureConfig(BaseOutputFeatureConfig, TextOutputFeatureConfigMi default=None, description="If not null this parameter is a c x c matrix in the form of a list of lists that contains the " "mutual similarity of classes. It is used if `class_similarities_temperature` is greater than 0. 
", + parameter_metadata=FEATURE_METADATA[TEXT]["class_similarities"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -87,6 +89,7 @@ class TextOutputFeatureConfig(BaseOutputFeatureConfig, TextOutputFeatureConfigMi dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[TEXT]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="text_output") @@ -94,10 +97,12 @@ class TextOutputFeatureConfig(BaseOutputFeatureConfig, TextOutputFeatureConfigMi reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[TEXT]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[TEXT]["reduce_input"], ) diff --git a/ludwig/schema/features/vector_feature.py b/ludwig/schema/features/vector_feature.py index 160d8035f9d..def4c5ff90a 100644 --- a/ludwig/schema/features/vector_feature.py +++ b/ludwig/schema/features/vector_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -73,6 +74,7 @@ class VectorOutputFeatureConfig(BaseOutputFeatureConfig, VectorOutputFeatureConf dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[VECTOR]["dependencies"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -87,22 +89,26 @@ class VectorOutputFeatureConfig(BaseOutputFeatureConfig, VectorOutputFeatureConf reduce_dependencies: str = schema_utils.ReductionOptions( default=None, description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[VECTOR]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default=None, description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[VECTOR]["reduce_input"], ) softmax: bool = schema_utils.Boolean( default=False, description="Determines whether to apply a softmax at the end of the decoder. This is useful for predicting a " "vector of values that sum up to 1 and can be interpreted as probabilities.", + parameter_metadata=FEATURE_METADATA[VECTOR]["softmax"], ) vector_size: int = schema_utils.PositiveInteger( default=None, allow_none=True, description="The size of the vector. 
If None, the vector size will be inferred from the data.", + parameter_metadata=FEATURE_METADATA[VECTOR]["vector_size"], ) diff --git a/ludwig/schema/metadata/configs/features.yaml b/ludwig/schema/metadata/configs/features.yaml index 9674fb97bb2..8fb13109e22 100644 --- a/ludwig/schema/metadata/configs/features.yaml +++ b/ludwig/schema/metadata/configs/features.yaml @@ -201,6 +201,16 @@ binary: - fill_value ui_display_name: Missing Value Strategy expected_impact: 3 + calibration: + expected_impact: 2 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + threshold: + expected_impact: 3 category: preprocessing: computed_fill_value: @@ -252,6 +262,16 @@ category: will leave out only very rare tokens that should not influence performance substantially ui_display_name: Most common (vocabulary size) + calibration: + expected_impact: 2 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + top_k: + expected_impact: 3 date: preprocessing: computed_fill_value: @@ -434,6 +454,14 @@ number: \ std = 1. It\u2019s useful when there are a few outliers, but not\ \ so extreme that you need clipping." ui_display_name: Normalization + clip: + expected_impact: 2 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 sequence: preprocessing: computed_fill_value: @@ -533,6 +561,12 @@ sequence: rather than treated as an unknown. expected_impact: 0 ui_display_name: Vocab File + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 set: preprocessing: computed_fill_value: @@ -586,6 +620,14 @@ set: tokenizer: ui_display_name: null expected_impact: 3 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + threshold: + expected_impact: 3 text: preprocessing: computed_fill_value: @@ -753,6 +795,14 @@ text: rather than treated as an unknown. 
expected_impact: 0 ui_display_name: Vocab File + class_similarities: + expected_impact: 1 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 timeseries: preprocessing: computed_fill_value: @@ -814,3 +864,13 @@ vector: vector_size: ui_display_name: null expected_impact: 3 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + softmax: + expected_impact: 3 + vector_size: + expected_impact: 3 From 525c4b7961614b4f12b623169f2339780f8b15c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 19:36:42 +0000 Subject: [PATCH 22/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/features/binary_feature.py | 2 +- ludwig/schema/features/category_feature.py | 2 +- ludwig/schema/features/number_feature.py | 2 +- ludwig/schema/features/sequence_feature.py | 2 +- ludwig/schema/features/set_feature.py | 2 +- ludwig/schema/features/text_feature.py | 2 +- ludwig/schema/features/vector_feature.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ludwig/schema/features/binary_feature.py b/ludwig/schema/features/binary_feature.py index 423d85588b3..4c852de7f6d 100644 --- a/ludwig/schema/features/binary_feature.py +++ b/ludwig/schema/features/binary_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/category_feature.py b/ludwig/schema/features/category_feature.py index 4fbb23ea308..c23c861d674 100644 --- a/ludwig/schema/features/category_feature.py +++ b/ludwig/schema/features/category_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/number_feature.py b/ludwig/schema/features/number_feature.py index 2bd51ab4689..e54e32403f3 100644 --- a/ludwig/schema/features/number_feature.py +++ b/ludwig/schema/features/number_feature.py @@ -20,8 +20,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/sequence_feature.py b/ludwig/schema/features/sequence_feature.py index 7c5b00b31a6..d8ea9d2718d 100644 --- a/ludwig/schema/features/sequence_feature.py +++ b/ludwig/schema/features/sequence_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/set_feature.py b/ludwig/schema/features/set_feature.py index da0faf8e27f..ff38a562637 100644 --- 
a/ludwig/schema/features/set_feature.py +++ b/ludwig/schema/features/set_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/text_feature.py b/ludwig/schema/features/text_feature.py index a1428b37cb8..b386d13fc41 100644 --- a/ludwig/schema/features/text_feature.py +++ b/ludwig/schema/features/text_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/vector_feature.py b/ludwig/schema/features/vector_feature.py index def4c5ff90a..629fda078cb 100644 --- a/ludwig/schema/features/vector_feature.py +++ b/ludwig/schema/features/vector_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig
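
For orientation, the pattern the last few patches converge on is a single nested-dict lookup: each config field fetches its parameter metadata from a YAML file keyed first by class name and then by parameter name (for example LOSS_METADATA["MSELoss"]["weight"] above). The snippet below is a minimal, standalone sketch of that lookup using plain PyYAML and a two-class excerpt of the new loss.yaml; it is an illustration only, not Ludwig's actual loader in ludwig/schema/metadata/__init__.py, which may wrap each entry in a richer metadata object.

import yaml

# Two-class excerpt of the loss.yaml added in patch 20, inlined so the
# sketch is self-contained.
LOSS_YAML = """
MSELoss:
  weight:
    expected_impact: 3
BWCEWLoss:
  positive_class_weight:
    expected_impact: 3
  robust_lambda:
    expected_impact: 2
"""

# Mimics LOSS_METADATA = _load("loss.yaml"): parse the YAML into nested dicts.
LOSS_METADATA = yaml.safe_load(LOSS_YAML)

# A schema field then attaches the entry for its own class and parameter,
# mirroring parameter_metadata=LOSS_METADATA["MSELoss"]["weight"] in the diffs.
print(LOSS_METADATA["MSELoss"]["weight"]["expected_impact"])           # -> 3
print(LOSS_METADATA["BWCEWLoss"]["robust_lambda"]["expected_impact"])  # -> 2

Keeping expected_impact and related hints in YAML rather than inline in the dataclasses keeps the field declarations short and lets the metadata be tuned without touching the schema code, the same trade-off the existing trainer and optimizer metadata files already make.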