From 755e4d729ae9c1fa364165547e883f4a7c88fb4f Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 18:37:02 -0700 Subject: [PATCH 01/22] Trainer Schema Changes --- ludwig/schema/metadata/configs/trainer.yaml | 36 ++++++++++----------- ludwig/schema/trainer.py | 28 ++++++++-------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/ludwig/schema/metadata/configs/trainer.yaml b/ludwig/schema/metadata/configs/trainer.yaml index 7769fc21342..064ecad959c 100644 --- a/ludwig/schema/metadata/configs/trainer.yaml +++ b/ludwig/schema/metadata/configs/trainer.yaml @@ -48,7 +48,7 @@ checkpoints_per_epoch: It is also more engaging and more valuable to ensure a frequent pulse of evaluation metrics, even if they are partial." - expected_impact: 3 + expected_impact: 2 related_parameters: - train_steps - steps_per_checkpoint @@ -73,7 +73,7 @@ early_stop: run is quit. This can be efficient for pruning bad models earlier, but since the training process is inherently non-deterministic and noisy, sometimes improvements happen very gradually over a long period of time. - expected_impact: 2 + expected_impact: 3 related_parameters: - epochs - train_steps @@ -114,7 +114,7 @@ eval_batch_size: maxing out memory limits will speed up the model training process overall. example_value: - 512 - expected_impact: 2 + expected_impact: 1 other_information: Should only set the batch_size to a level that you can fit in memory @@ -139,7 +139,7 @@ evaluate_training_set: training set is large, can be a huge computational cost. Turning off training set evaluation will lead to significant gains in training throughput and efficiency. For small datasets that train and evaluate quickly, the choice is trivial. - expected_impact: 3 + expected_impact: 1 suggested_values: false suggested_values_reasoning: Running full-scale evaluation on the full training @@ -157,7 +157,7 @@ gradient_clipping: gradients in very deep networks. Increasing gradient clipping can help with model training loss curve stability, but it can also make training less efficient as weight at each training step is capped. - expected_impact: 2 + expected_impact: 1 suggested_values_reasoning: It's usually sensible to have some conservative notion of gradient clipping to make modeling robust to a particularly bad or noisy @@ -209,7 +209,7 @@ learning_rate_scaling: can sometimes lead to better model performance. If the learning rate is hand-tuned for a given number of workers, setting this value to constant can be used to disable scale-up. - expected_impact: 2 + expected_impact: 1 suggested_values: linear or sqrt suggested_values_reasoning: Traditionally the learning rate is scaled linearly @@ -226,7 +226,7 @@ max_batch_size: by auto batch size tuning and batch size increasing on plateau. example_value: - 1024 - expected_impact: 2 + expected_impact: 1 related_parameters: - batch_size - increase_batch_size_on_plateau @@ -276,7 +276,7 @@ regularization_lambda: is data-dependent, so you'll need to do some tuning. We recommend trying a handful of values (0.01, 0.02, ... 
0.4) gradually increasing the value until training curves get worse" - expected_impact: 3 + expected_impact: 2 literature_references: - "https://developers.google.com/machine-learning/crash-course/regularization-for-simplicity/lambda " related_parameters: @@ -297,7 +297,7 @@ regularization_type: \ selection, since weights are only reduced to values near 0 instead of 0.\ \ L1 regularization has built-in feature selection.\nL1 regularization is\ \ robust to outliers, L2 regularization is not." - expected_impact: 3 + expected_impact: 2 literature_references: - "https://neptune.ai/blog/fighting-overfitting-with-l1-or-l2-regularization#:~:text=The%20differences%20between%20L1%20and,regularization%20solution%20is%20non%2Dsparse. " related_parameters: @@ -311,7 +311,7 @@ should_shuffle: description_implications: Turning off mini-batch shuffling can make training faster, but it may lead to worse performance overall as shuffling helps mitigate overfitting. - expected_impact: 2 + expected_impact: 1 literature_references: - "https://stats.stackexchange.com/questions/245502/why-should-we-shuffle-data-while-training-a-neural-network#:~:text=it%20helps%20the%20training%20converge,the%20order%20of%20the%20training " suggested_values: true @@ -347,7 +347,7 @@ steps_per_checkpoint: It is also more engaging and more valuable to ensure a frequent pulse of evaluation metrics, even if they are partial." - expected_impact: 3 + expected_impact: 1 related_parameters: - checkpoints_per_epoch suggested_values: O(1k) for larger datasets @@ -365,7 +365,7 @@ train_steps: description_implications: Decreasing this will shorten the overall runway for training the model. - expected_impact: 3 + expected_impact: 1 related_parameters: - epochs suggested_values: 0 (and use epochs), or 1000000, 1 for debugging @@ -399,7 +399,7 @@ validation_field: This parameter affects 1) what the early stopping policy looks at to determine when to early stop and 2) hyperparameter optimization for determining the best trial. - expected_impact: 3 + expected_impact: 1 related_parameters: - validation_field - validation_metric @@ -410,7 +410,7 @@ validation_metric: This parameter affects 1) what the early stopping policy looks at to determine when to early stop and 2) hyperparameter optimization for determining the best trial. - expected_impact: 3 + expected_impact: 1 related_parameters: - validation_field - validation_metric @@ -474,7 +474,7 @@ learning_rate_scheduler: \ As a rule of thumb, compared to training without a schedule, you can use\ \ a slightly higher maximum learning rate. Since the learning rate changes\ \ over time, the whole training is not so sensitive to the value picked." - expected_impact: 2 + expected_impact: 3 literature_references: - "https://peltarion.com/knowledge-center/documentation/modeling-view/run-a-model/optimization-principles-(in-deep-learning)/learning-rate-schedule " related_parameters: @@ -498,7 +498,7 @@ learning_rate_scheduler: faster. This could make the model more robust to a bad (too high) initial learning rate, but a decay rate that is too high could prohibit the model from learning anything at all. - expected_impact: 1 + expected_impact: 2 literature_references: - "https://peltarion.com/knowledge-center/documentation/modeling-view/run-a-model/optimization-principles-(in-deep-learning)/learning-rate-schedule " related_parameters: @@ -519,7 +519,7 @@ learning_rate_scheduler: learning rate decays. 
example_value: - 5000 - expected_impact: 1 + expected_impact: 2 related_parameters: - decay_rate - decay_steps @@ -552,7 +552,7 @@ learning_rate_scheduler: decaying the learning rate is superior to doing so continuously. ui_display_name: Staircase reduce_on_plateau: - expected_impact: 1 + expected_impact: 2 ui_display_name: Reduce On Plateau reduce_on_plateau_patience: expected_impact: 1 diff --git a/ludwig/schema/trainer.py b/ludwig/schema/trainer.py index 8c6945aa353..7abc824ea3f 100644 --- a/ludwig/schema/trainer.py +++ b/ludwig/schema/trainer.py @@ -64,6 +64,20 @@ class ECDTrainerConfig(BaseTrainerConfig): parameter_metadata=TRAINER_METADATA["epochs"], ) + batch_size: Union[int, str] = schema_utils.OneOfOptionsField( + default=DEFAULT_BATCH_SIZE, + allow_none=False, + description=( + "The number of training examples utilized in one training step of the model. If ’auto’, the " + "biggest batch size (power of 2) that can fit in memory will be used." + ), + parameter_metadata=TRAINER_METADATA["batch_size"], + field_options=[ + schema_utils.PositiveInteger(default=128, description="", allow_none=False), + schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False), + ], + ) + checkpoints_per_epoch: int = schema_utils.NonNegativeInteger( default=0, description=( @@ -101,20 +115,6 @@ class ECDTrainerConfig(BaseTrainerConfig): parameter_metadata=TRAINER_METADATA["early_stop"], ) - batch_size: Union[int, str] = schema_utils.OneOfOptionsField( - default=DEFAULT_BATCH_SIZE, - allow_none=False, - description=( - "The number of training examples utilized in one training step of the model. If ’auto’, the " - "biggest batch size (power of 2) that can fit in memory will be used." - ), - parameter_metadata=TRAINER_METADATA["batch_size"], - field_options=[ - schema_utils.PositiveInteger(default=128, description="", allow_none=False), - schema_utils.StringOptions(options=["auto"], default="auto", allow_none=False), - ], - ) - max_batch_size: int = schema_utils.PositiveInteger( default=MAX_POSSIBLE_BATCH_SIZE, allow_none=True, From 6abed485968d27135044dde15dbe5cd6dbd1bb0c Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 18:55:12 -0700 Subject: [PATCH 02/22] Add optimizer EI --- ludwig/schema/optimizers.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/ludwig/schema/optimizers.py b/ludwig/schema/optimizers.py index ebff605dbf5..29d6ac8ac52 100644 --- a/ludwig/schema/optimizers.py +++ b/ludwig/schema/optimizers.py @@ -421,11 +421,26 @@ def _jsonschema_type_mapping(): class GradientClippingConfig(schema_utils.BaseMarshmallowConfig): """Dataclass that holds gradient clipping parameters.""" - clipglobalnorm: Optional[float] = schema_utils.FloatRange(default=0.5, allow_none=True, description="") + clipglobalnorm: Optional[float] = schema_utils.FloatRange( + default=0.5, + allow_none=True, + description="", + parameter_metadata=TRAINER_METADATA["gradient_clipping"] + ) - clipnorm: Optional[float] = schema_utils.FloatRange(default=None, allow_none=True, description="") + clipnorm: Optional[float] = schema_utils.FloatRange( + default=None, + allow_none=True, + description="", + parameter_metadata=TRAINER_METADATA["gradient_clipping"] + ) - clipvalue: Optional[float] = schema_utils.FloatRange(default=None, allow_none=True, description="") + clipvalue: Optional[float] = schema_utils.FloatRange( + default=None, + allow_none=True, + description="", + parameter_metadata=TRAINER_METADATA["gradient_clipping"] + ) @DeveloperAPI From 
5ca3330f6ebee8cd5e78f2aaa26f7f526f9017fd Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 19:47:53 -0700 Subject: [PATCH 03/22] Add optimizer metadata' --- ludwig/schema/metadata/__init__.py | 1 + .../schema/metadata/configs/optimizers.yaml | 59 +++++ ludwig/schema/metadata/configs/trainer.yaml | 15 -- ludwig/schema/optimizers.py | 213 ++++++++++++++---- 4 files changed, 223 insertions(+), 65 deletions(-) create mode 100644 ludwig/schema/metadata/configs/optimizers.yaml diff --git a/ludwig/schema/metadata/__init__.py b/ludwig/schema/metadata/__init__.py index 6c93a1f0719..fbeb96ed964 100644 --- a/ludwig/schema/metadata/__init__.py +++ b/ludwig/schema/metadata/__init__.py @@ -33,3 +33,4 @@ def _load(fname: str) -> Dict[str, Any]: FEATURE_METADATA = _load("features.yaml") PREPROCESSING_METADATA = _load("preprocessing.yaml") TRAINER_METADATA = _load("trainer.yaml") +OPTIMIZER_METADATA = _load("optimizers.yaml") diff --git a/ludwig/schema/metadata/configs/optimizers.yaml b/ludwig/schema/metadata/configs/optimizers.yaml new file mode 100644 index 00000000000..f28b246bb8d --- /dev/null +++ b/ludwig/schema/metadata/configs/optimizers.yaml @@ -0,0 +1,59 @@ +gradient_clipping: + default_value_reasoning: + A conservative cap on the maximum gradient size to apply + over a single training step. + description_implications: + Gradient clipping is a technique to prevent exploding + gradients in very deep networks. Increasing gradient clipping can help with + model training loss curve stability, but it can also make training less efficient + as weight at each training step is capped. + expected_impact: 1 + suggested_values_reasoning: + It's usually sensible to have some conservative notion + of gradient clipping to make modeling robust to a particularly bad or noisy + batch of examples. + ui_display_name: Gradient Clipping +momentum: + expected_impact: 1 +weight_decay: + expected_impact: 1 +dampening: + expected_impact: 1 +nesterov: + expected_impact: 1 +max_iter: + expected_impact: 1 +max_eval: + expected_impact: 1 +tolerance_grad: + expected_impact: 1 +tolerance_change: + expected_impact: 1 +history_size: + expected_impact: 1 +line_search_fn: + expected_impact: 1 +betas: + expected_impact: 1 +amsgrad: + expected_impact: 1 +rho: + expected_impact: 1 +initial_accumulator_value: + expected_impact: 1 +lr_decay: + expected_impact: 1 +learning_rate_power: + expected_impact: 1 +l1_regularization_strength: + expected_impact: 1 +l2_regularization_strength: + expected_impact: 1 +momentum_decay: + expected_impact: 1 +alpha: + expected_impact: 1 +eps: + expected_impact: 1 +centered: + expected_impact: 1 \ No newline at end of file diff --git a/ludwig/schema/metadata/configs/trainer.yaml b/ludwig/schema/metadata/configs/trainer.yaml index 064ecad959c..4f87658c399 100644 --- a/ludwig/schema/metadata/configs/trainer.yaml +++ b/ludwig/schema/metadata/configs/trainer.yaml @@ -148,21 +148,6 @@ evaluate_training_set: so it will still be easy to spot signs of overfitting like when the training-validation loss curves diverge. ui_display_name: Evaluate Training Set -gradient_clipping: - default_value_reasoning: - A conservative cap on the maximum gradient size to apply - over a single training step. - description_implications: - Gradient clipping is a technique to prevent exploding - gradients in very deep networks. Increasing gradient clipping can help with - model training loss curve stability, but it can also make training less efficient - as weight at each training step is capped. 
- expected_impact: 1 - suggested_values_reasoning: - It's usually sensible to have some conservative notion - of gradient clipping to make modeling robust to a particularly bad or noisy - batch of examples. - ui_display_name: Gradient Clipping increase_batch_size_eval_metric: expected_impact: 1 ui_display_name: "Batch Size Increase: Evaluation Metric" diff --git a/ludwig/schema/optimizers.py b/ludwig/schema/optimizers.py index 29d6ac8ac52..25b98f3ef23 100644 --- a/ludwig/schema/optimizers.py +++ b/ludwig/schema/optimizers.py @@ -8,7 +8,7 @@ import ludwig.schema.utils as schema_utils from ludwig.api_annotations import DeveloperAPI -from ludwig.schema.metadata import TRAINER_METADATA +from ludwig.schema.metadata import OPTIMIZER_METADATA from ludwig.schema.metadata.parameter_metadata import convert_metadata_to_json from ludwig.utils.registry import Registry @@ -65,10 +65,26 @@ class SGDOptimizerConfig(BaseOptimizerConfig): 'sgd')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD : - momentum: float = schema_utils.NonNegativeFloat(default=0.0, description="Momentum factor.") - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") - dampening: float = schema_utils.NonNegativeFloat(default=0.0, description="Dampening for momentum.") - nesterov: bool = schema_utils.Boolean(default=False, description="Enables Nesterov momentum.") + momentum: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Momentum factor.", + parameter_metadata=OPTIMIZER_METADATA["momentum"] + ) + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) + dampening: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Dampening for momentum.", + parameter_metadata=OPTIMIZER_METADATA["dampening"] + ) + nesterov: bool = schema_utils.Boolean( + default=False, + description="Enables Nesterov momentum.", + parameter_metadata=OPTIMIZER_METADATA["nesterov"] + ) @DeveloperAPI @@ -85,23 +101,42 @@ class LBFGSOptimizerConfig(BaseOptimizerConfig): 'lbfgs')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.LBFGS.html#torch.optim.LBFGS - max_iter: int = schema_utils.Integer(default=20, description="Maximum number of iterations per optimization step.") + max_iter: int = schema_utils.Integer( + default=20, + description="Maximum number of iterations per optimization step.", + parameter_metadata=OPTIMIZER_METADATA["max_iter"] + ) + max_eval: int = schema_utils.Integer( default=None, allow_none=True, description="Maximum number of function evaluations per optimization step. Default: `max_iter` * 1.25.", + parameter_metadata=OPTIMIZER_METADATA["max_eval"] ) + tolerance_grad: float = schema_utils.NonNegativeFloat( - default=1e-07, description="Termination tolerance on first order optimality." + default=1e-07, + description="Termination tolerance on first order optimality.", + parameter_metadata=OPTIMIZER_METADATA["tolerance_grad"] ) + tolerance_change: float = schema_utils.NonNegativeFloat( - default=1e-09, description="Termination tolerance on function value/parameter changes." 
+ default=1e-09, + description="Termination tolerance on function value/parameter changes.", + parameter_metadata=OPTIMIZER_METADATA["tolerance_change"] + ) + + history_size: int = schema_utils.Integer( + default=100, + description="Update history size.", + parameter_metadata=OPTIMIZER_METADATA["history_size"] ) - history_size: int = schema_utils.Integer(default=100, description="Update history size.") + line_search_fn: str = schema_utils.StringOptions( ["strong_wolfe"], default=None, description="Line search function to use.", + parameter_metadata=OPTIMIZER_METADATA["line_search_fn"] ) @@ -120,21 +155,28 @@ class AdamOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." + default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay (L2 penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay (L2 penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) amsgrad: bool = schema_utils.Boolean( default=False, - description=( - "Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam and" - "Beyond'." - ), + description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam " + "and Beyond'.", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"] ) @@ -153,21 +195,28 @@ class AdamWOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." + default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) amsgrad: bool = schema_utils.Boolean( default=False, - description=( - "Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam and " - "Beyond'." - ), + description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam " + "and Beyond'. 
", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"] ) @@ -190,13 +239,20 @@ class AdadeltaOptimizerConfig(BaseOptimizerConfig): min=0, max=1, description="Coefficient used for computing a running average of squared gradients.", + parameter_metadata=OPTIMIZER_METADATA["rho"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-06, description="Term added to the denominator to improve numerical stability." + default=1e-06, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) @DeveloperAPI @@ -214,14 +270,28 @@ class AdagradOptimizerConfig(BaseOptimizerConfig): (default: 'adagrad')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adagrad.html#torch.optim.Adagrad : - initial_accumulator_value: float = schema_utils.NonNegativeFloat(default=0, description="") + initial_accumulator_value: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + ) - lr_decay: float = schema_utils.FloatRange(default=0, description="Learning rate decay.") + lr_decay: float = schema_utils.FloatRange( + default=0, + description="Learning rate decay.", + parameter_metadata=OPTIMIZER_METADATA["lr_decay"] + ) - weight_decay: float = schema_utils.FloatRange(default=0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.FloatRange( + default=0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) eps: float = schema_utils.FloatRange( - default=1e-10, description="Term added to the denominator to improve numerical stability." + default=1e-10, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) @@ -240,14 +310,22 @@ class AdamaxOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adamax.html#torch.optim.Adamax : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"] ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." 
+ default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"] ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) # NOTE: keep ftrl and nadam optimizers out of registry: @@ -258,13 +336,26 @@ class FtrlOptimizerConfig(BaseOptimizerConfig): # optimizer_class: ClassVar[torch.optim.Optimizer] = torch.optim.Ftrl type: str = schema_utils.ProtectedString("ftrl") - learning_rate_power: float = schema_utils.FloatRange(default=-0.5, max=0.0) + learning_rate_power: float = schema_utils.FloatRange( + default=-0.5, + max=0, + parameter_metadata=OPTIMIZER_METADATA["learning_rate_power"] + ) - initial_accumulator_value: float = schema_utils.NonNegativeFloat(default=0.1) + initial_accumulator_value: float = schema_utils.NonNegativeFloat( + default=0.1, + parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + ) - l1_regularization_strength: float = schema_utils.NonNegativeFloat(default=0.0) + l1_regularization_strength: float = schema_utils.NonNegativeFloat( + default=0.0, + parameter_metadata=OPTIMIZER_METADATA["l1_regularization_strength"] + ) - l2_regularization_strength: float = schema_utils.NonNegativeFloat(default=0.0) + l2_regularization_strength: float = schema_utils.NonNegativeFloat( + default=0.0, + parameter_metadata=OPTIMIZER_METADATA["l2_regularization_strength"] + ) @DeveloperAPI @@ -279,16 +370,28 @@ class NadamOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.NAdam.html#torch.optim.NAdam : betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( - default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square." + default=(0.9, 0.999), + description="Coefficients used for computing running averages of gradient and its square.", + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." 
+ default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"], ) - weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") + weight_decay: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Weight decay ($L2$ penalty).", + parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + ) - momentum_decay: float = schema_utils.NonNegativeFloat(default=4e-3, description="Momentum decay.") + momentum_decay: float = schema_utils.NonNegativeFloat( + default=4e-3, + description="Momentum decay.", + parameter_metadata=OPTIMIZER_METADATA["momentum_decay"] + ) @DeveloperAPI @@ -305,19 +408,29 @@ class RMSPropOptimizerConfig(BaseOptimizerConfig): (default: 'rmsprop')""" # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.RMSprop.html#torch.optim.RMSprop: - momentum: float = schema_utils.NonNegativeFloat(default=0.0, description="Momentum factor.") + momentum: float = schema_utils.NonNegativeFloat( + default=0.0, + description="Momentum factor.", + parameter_metadata=OPTIMIZER_METADATA["momentum"], + ) - alpha: float = schema_utils.NonNegativeFloat(default=0.99, description="Smoothing constant.") + alpha: float = schema_utils.NonNegativeFloat( + default=0.99, + description="Smoothing constant.", + parameter_metadata=OPTIMIZER_METADATA["alpha"], + ) eps: float = schema_utils.NonNegativeFloat( - default=1e-08, description="Term added to the denominator to improve numerical stability." + default=1e-08, + description="Term added to the denominator to improve numerical stability.", + parameter_metadata=OPTIMIZER_METADATA["eps"], ) centered: bool = schema_utils.Boolean( default=False, - description=( - "If True, computes the centered RMSProp, and the gradient is normalized by an estimation of its variance." 
- ), + description="If True, computes the centered RMSProp, and the gradient is normalized by an estimation of its " + "variance.", + parameter_metadata=OPTIMIZER_METADATA["centered"], ) weight_decay: float = schema_utils.NonNegativeFloat(default=0.0, description="Weight decay ($L2$ penalty).") @@ -425,21 +538,21 @@ class GradientClippingConfig(schema_utils.BaseMarshmallowConfig): default=0.5, allow_none=True, description="", - parameter_metadata=TRAINER_METADATA["gradient_clipping"] + parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipnorm: Optional[float] = schema_utils.FloatRange( default=None, allow_none=True, description="", - parameter_metadata=TRAINER_METADATA["gradient_clipping"] + parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipvalue: Optional[float] = schema_utils.FloatRange( default=None, allow_none=True, description="", - parameter_metadata=TRAINER_METADATA["gradient_clipping"] + parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) @@ -500,7 +613,7 @@ def _jsonschema_type_mapping(): dump_default=dump_default, metadata={ "description": description, - "parameter_metadata": convert_metadata_to_json(TRAINER_METADATA["gradient_clipping"]), + "parameter_metadata": convert_metadata_to_json(OPTIMIZER_METADATA["gradient_clipping"]), }, ) }, From 828b8766ef023907593c5015d97ee07e3b910ab7 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 20:55:08 -0700 Subject: [PATCH 04/22] Feature preprocessing done --- ludwig/schema/metadata/configs/features.yaml | 91 +++++++++++++++----- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/ludwig/schema/metadata/configs/features.yaml b/ludwig/schema/metadata/configs/features.yaml index c1a0289f997..db686e09ce4 100644 --- a/ludwig/schema/metadata/configs/features.yaml +++ b/ludwig/schema/metadata/configs/features.yaml @@ -2,14 +2,16 @@ audio: preprocessing: audio_file_length_limit_in_s: ui_display_name: null + expected_impact: 2 computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 ui_display_name: Fill Value + expected_impact: 1 in_memory: ui_display_name: null + expected_impact: 1 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -37,7 +39,7 @@ audio: learning rate. 
example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -53,6 +55,7 @@ audio: ui_display_name: Normalization Type num_fft_points: ui_display_name: null + expected_impact: 1 num_filter_bands: literature_references: - "https://medium.com/analytics-vidhya/simplifying-audio-data-fft-stft-mfcc-for-machine-learning-and-deep-learning-443a2f962e0e " @@ -61,8 +64,10 @@ audio: - type - window_shift_in_s ui_display_name: Type + expected_impact: 1 padding_value: ui_display_name: null + expected_impact: 1 type: default_value_reasoning: The default type fbank is set based on values @@ -98,6 +103,7 @@ audio: - type - num_filter_bands ui_display_name: Window Length in Seconds + expected_impact: 2 window_shift_in_s: literature_references: - "https://medium.com/analytics-vidhya/simplifying-audio-data-fft-stft-mfcc-for-machine-learning-and-deep-learning-443a2f962e0e " @@ -106,18 +112,21 @@ audio: - type - num_filter_bands ui_display_name: Window Shift in Seconds + expected_impact: 2 window_type: ui_display_name: null + expected_impact: 2 bag: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 ui_display_name: Fill Value + expected_impact: 1 lowercase: ui_display_name: null + expected_impact: 2 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -145,7 +154,7 @@ bag: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -160,6 +169,7 @@ bag: ui_display_name: Most common (vocabulary size) tokenizer: ui_display_name: null + expected_impact: 3 binary: preprocessing: computed_fill_value: @@ -174,7 +184,7 @@ binary: expected_impact: 2 ui_display_name: Fallback True Label fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -190,16 +200,18 @@ binary: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 category: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: ui_display_name: null + expected_impact: 2 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -214,6 +226,7 @@ category: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 most_common: default_value_reasoning: If there are more than 10000 unique categories @@ -226,7 +239,7 @@ category: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -258,11 +271,11 @@ date: serves as a truncator. example_value: - "%d %b %Y" - expected_impact: 1 + expected_impact: 2 suggested_values_reasoning: Have Ludwig figure out the date format automatically. 
ui_display_name: Datetime format fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -278,13 +291,14 @@ date: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 h3: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -300,28 +314,36 @@ h3: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 image: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value height: ui_display_name: null + expected_impact: 2 in_memory: ui_display_name: null + expected_impact: 1 infer_image_dimensions: ui_display_name: null + expected_impact: 1 infer_image_max_height: ui_display_name: null + expected_impact: 1 infer_image_max_width: ui_display_name: null + expected_impact: 1 infer_image_num_channels: ui_display_name: null + expected_impact: 1 infer_image_sample_size: ui_display_name: null + expected_impact: 1 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -336,10 +358,13 @@ image: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 num_channels: ui_display_name: null + expected_impact: 2 num_processes: ui_display_name: null + expected_impact: 2 resize_method: default_value_reasoning: Interpolation may stretch or squish the image, @@ -358,15 +383,17 @@ image: ui_display_name: Resize Method standardize_image: ui_display_name: null + expected_impact: 1 width: ui_display_name: null + expected_impact: 2 number: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -382,6 +409,7 @@ number: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 normalization: default_value_reasoning: It could be valuable to observe how the model @@ -412,10 +440,11 @@ sequence: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: The default value is 256. 
Every sequence will @@ -448,6 +477,7 @@ sequence: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 most_common: default_value_reasoning: If there are more than 10000 unique categories @@ -460,7 +490,7 @@ sequence: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -478,14 +508,19 @@ sequence: example_value: - 3 ui_display_name: n-gram size + expected_impact: 2 padding: ui_display_name: null + expected_impact: 1 padding_symbol: ui_display_name: null + expected_impact: 1 tokenizer: ui_display_name: null + expected_impact: 3 unknown_symbol: ui_display_name: null + expected_impact: 1 vocab_file: default_value_reasoning: The vocabulary can be parsed automatically from @@ -496,7 +531,7 @@ sequence: that fits your data, or if there are several uncommon or infrequently occurring tokens that we want to guarantee to be a part of the vocabulary, rather than treated as an unknown. - expected_impact: 2 + expected_impact: 0 ui_display_name: Vocab File set: preprocessing: @@ -504,10 +539,11 @@ set: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: ui_display_name: null + expected_impact: 2 missing_value_strategy: default_value_reasoning: The default `fill_with_const` replaces missing @@ -559,7 +595,7 @@ text: - missing_value_strategy, fill_value ui_display_name: DOCSTRING ONLY fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value lowercase: default_value_reasoning: @@ -571,7 +607,7 @@ text: words are seen as completely separate entities than lowercase words. example_value: - true - expected_impact: 1 + expected_impact: 2 related_parameters: - vocab_size suggested_values: "TRUE" @@ -612,6 +648,7 @@ text: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 most_common: default_value_reasoning: If there are more than 10000 unique categories @@ -624,7 +661,7 @@ text: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -642,6 +679,7 @@ text: example_value: - 3 ui_display_name: n-gram size + expected_impact: 2 padding: default_value_reasoning: We usually want to add padding to the end of @@ -663,8 +701,10 @@ text: ui_display_name: Padding padding_symbol: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null + expected_impact: 0 tokenizer: default_value_reasoning: 'The default tokenizer is `space_punct`, an abbreviation @@ -699,6 +739,7 @@ text: ui_display_name: Tokenizer unknown_symbol: ui_display_name: null + expected_impact: 1 vocab_file: default_value_reasoning: The vocabulary can be parsed automatically from @@ -709,7 +750,7 @@ text: that fits your data, or if there are several uncommon or infrequently occurring tokens that we want to guarantee to be a part of the vocabulary, rather than treated as an unknown. 
- expected_impact: 2 + expected_impact: 0 ui_display_name: Vocab File timeseries: preprocessing: @@ -717,7 +758,7 @@ timeseries: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -733,21 +774,26 @@ timeseries: related_parameters: - fill_value ui_display_name: Missing Value Strategy + expected_impact: 3 padding: ui_display_name: null + expected_impact: 1 padding_value: ui_display_name: null + expected_impact: 1 timeseries_length_limit: ui_display_name: null + expected_impact: 2 tokenizer: ui_display_name: null + expected_impact: 3 vector: preprocessing: computed_fill_value: internal_only: true ui_display_name: null fill_value: - expected_impact: 3 + expected_impact: 1 ui_display_name: Fill Value missing_value_strategy: default_value_reasoning: @@ -766,3 +812,4 @@ vector: ui_display_name: Missing Value Strategy vector_size: ui_display_name: null + expected_impact: 3 From 178a9385900ef19c9a5838e699dde20dfd6e719a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 03:56:39 +0000 Subject: [PATCH 05/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../schema/metadata/configs/optimizers.yaml | 2 +- ludwig/schema/optimizers.py | 122 ++++++------------ 2 files changed, 41 insertions(+), 83 deletions(-) diff --git a/ludwig/schema/metadata/configs/optimizers.yaml b/ludwig/schema/metadata/configs/optimizers.yaml index f28b246bb8d..bd68f3c847b 100644 --- a/ludwig/schema/metadata/configs/optimizers.yaml +++ b/ludwig/schema/metadata/configs/optimizers.yaml @@ -56,4 +56,4 @@ alpha: eps: expected_impact: 1 centered: - expected_impact: 1 \ No newline at end of file + expected_impact: 1 diff --git a/ludwig/schema/optimizers.py b/ludwig/schema/optimizers.py index 25b98f3ef23..5f97324be4e 100644 --- a/ludwig/schema/optimizers.py +++ b/ludwig/schema/optimizers.py @@ -66,24 +66,16 @@ class SGDOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD : momentum: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Momentum factor.", - parameter_metadata=OPTIMIZER_METADATA["momentum"] + default=0.0, description="Momentum factor.", parameter_metadata=OPTIMIZER_METADATA["momentum"] ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) dampening: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Dampening for momentum.", - parameter_metadata=OPTIMIZER_METADATA["dampening"] + default=0.0, description="Dampening for momentum.", parameter_metadata=OPTIMIZER_METADATA["dampening"] ) nesterov: bool = schema_utils.Boolean( - default=False, - description="Enables Nesterov momentum.", - parameter_metadata=OPTIMIZER_METADATA["nesterov"] + default=False, description="Enables Nesterov momentum.", parameter_metadata=OPTIMIZER_METADATA["nesterov"] ) @@ -104,39 +96,37 @@ class LBFGSOptimizerConfig(BaseOptimizerConfig): max_iter: int = schema_utils.Integer( default=20, description="Maximum number of iterations per optimization step.", - parameter_metadata=OPTIMIZER_METADATA["max_iter"] + 
parameter_metadata=OPTIMIZER_METADATA["max_iter"], ) max_eval: int = schema_utils.Integer( default=None, allow_none=True, description="Maximum number of function evaluations per optimization step. Default: `max_iter` * 1.25.", - parameter_metadata=OPTIMIZER_METADATA["max_eval"] + parameter_metadata=OPTIMIZER_METADATA["max_eval"], ) tolerance_grad: float = schema_utils.NonNegativeFloat( default=1e-07, description="Termination tolerance on first order optimality.", - parameter_metadata=OPTIMIZER_METADATA["tolerance_grad"] + parameter_metadata=OPTIMIZER_METADATA["tolerance_grad"], ) tolerance_change: float = schema_utils.NonNegativeFloat( default=1e-09, description="Termination tolerance on function value/parameter changes.", - parameter_metadata=OPTIMIZER_METADATA["tolerance_change"] + parameter_metadata=OPTIMIZER_METADATA["tolerance_change"], ) history_size: int = schema_utils.Integer( - default=100, - description="Update history size.", - parameter_metadata=OPTIMIZER_METADATA["history_size"] + default=100, description="Update history size.", parameter_metadata=OPTIMIZER_METADATA["history_size"] ) line_search_fn: str = schema_utils.StringOptions( ["strong_wolfe"], default=None, description="Line search function to use.", - parameter_metadata=OPTIMIZER_METADATA["line_search_fn"] + parameter_metadata=OPTIMIZER_METADATA["line_search_fn"], ) @@ -157,26 +147,24 @@ class AdamOptimizerConfig(BaseOptimizerConfig): betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square.", - parameter_metadata=OPTIMIZER_METADATA["betas"] + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-08, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay (L2 penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay (L2 penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) amsgrad: bool = schema_utils.Boolean( default=False, description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the Convergence of Adam " - "and Beyond'.", - parameter_metadata=OPTIMIZER_METADATA["amsgrad"] + "and Beyond'.", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"], ) @@ -197,26 +185,24 @@ class AdamWOptimizerConfig(BaseOptimizerConfig): betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square.", - parameter_metadata=OPTIMIZER_METADATA["betas"] + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-08, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) amsgrad: bool = schema_utils.Boolean( default=False, description="Whether to use the AMSGrad variant of this algorithm from the paper 'On the 
Convergence of Adam " - "and Beyond'. ", - parameter_metadata=OPTIMIZER_METADATA["amsgrad"] + "and Beyond'. ", + parameter_metadata=OPTIMIZER_METADATA["amsgrad"], ) @@ -239,19 +225,17 @@ class AdadeltaOptimizerConfig(BaseOptimizerConfig): min=0, max=1, description="Coefficient used for computing a running average of squared gradients.", - parameter_metadata=OPTIMIZER_METADATA["rho"] + parameter_metadata=OPTIMIZER_METADATA["rho"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-06, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) @@ -271,27 +255,21 @@ class AdagradOptimizerConfig(BaseOptimizerConfig): # Defaults taken from https://pytorch.org/docs/stable/generated/torch.optim.Adagrad.html#torch.optim.Adagrad : initial_accumulator_value: float = schema_utils.NonNegativeFloat( - default=0, - description="", - parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + default=0, description="", parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] ) lr_decay: float = schema_utils.FloatRange( - default=0, - description="Learning rate decay.", - parameter_metadata=OPTIMIZER_METADATA["lr_decay"] + default=0, description="Learning rate decay.", parameter_metadata=OPTIMIZER_METADATA["lr_decay"] ) weight_decay: float = schema_utils.FloatRange( - default=0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) eps: float = schema_utils.FloatRange( default=1e-10, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) @@ -312,19 +290,17 @@ class AdamaxOptimizerConfig(BaseOptimizerConfig): betas: Tuple[float, float] = schema_utils.FloatRangeTupleDataclassField( default=(0.9, 0.999), description="Coefficients used for computing running averages of gradient and its square.", - parameter_metadata=OPTIMIZER_METADATA["betas"] + parameter_metadata=OPTIMIZER_METADATA["betas"], ) eps: float = schema_utils.NonNegativeFloat( default=1e-08, description="Term added to the denominator to improve numerical stability.", - parameter_metadata=OPTIMIZER_METADATA["eps"] + parameter_metadata=OPTIMIZER_METADATA["eps"], ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) @@ -337,24 +313,19 @@ class FtrlOptimizerConfig(BaseOptimizerConfig): type: str = schema_utils.ProtectedString("ftrl") learning_rate_power: float = schema_utils.FloatRange( - default=-0.5, - max=0, - parameter_metadata=OPTIMIZER_METADATA["learning_rate_power"] + default=-0.5, max=0, parameter_metadata=OPTIMIZER_METADATA["learning_rate_power"] ) initial_accumulator_value: float = schema_utils.NonNegativeFloat( - default=0.1, - parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] + default=0.1, 
parameter_metadata=OPTIMIZER_METADATA["initial_accumulator_value"] ) l1_regularization_strength: float = schema_utils.NonNegativeFloat( - default=0.0, - parameter_metadata=OPTIMIZER_METADATA["l1_regularization_strength"] + default=0.0, parameter_metadata=OPTIMIZER_METADATA["l1_regularization_strength"] ) l2_regularization_strength: float = schema_utils.NonNegativeFloat( - default=0.0, - parameter_metadata=OPTIMIZER_METADATA["l2_regularization_strength"] + default=0.0, parameter_metadata=OPTIMIZER_METADATA["l2_regularization_strength"] ) @@ -382,15 +353,11 @@ class NadamOptimizerConfig(BaseOptimizerConfig): ) weight_decay: float = schema_utils.NonNegativeFloat( - default=0.0, - description="Weight decay ($L2$ penalty).", - parameter_metadata=OPTIMIZER_METADATA["weight_decay"] + default=0.0, description="Weight decay ($L2$ penalty).", parameter_metadata=OPTIMIZER_METADATA["weight_decay"] ) momentum_decay: float = schema_utils.NonNegativeFloat( - default=4e-3, - description="Momentum decay.", - parameter_metadata=OPTIMIZER_METADATA["momentum_decay"] + default=4e-3, description="Momentum decay.", parameter_metadata=OPTIMIZER_METADATA["momentum_decay"] ) @@ -429,7 +396,7 @@ class RMSPropOptimizerConfig(BaseOptimizerConfig): centered: bool = schema_utils.Boolean( default=False, description="If True, computes the centered RMSProp, and the gradient is normalized by an estimation of its " - "variance.", + "variance.", parameter_metadata=OPTIMIZER_METADATA["centered"], ) @@ -535,24 +502,15 @@ class GradientClippingConfig(schema_utils.BaseMarshmallowConfig): """Dataclass that holds gradient clipping parameters.""" clipglobalnorm: Optional[float] = schema_utils.FloatRange( - default=0.5, - allow_none=True, - description="", - parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] + default=0.5, allow_none=True, description="", parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipnorm: Optional[float] = schema_utils.FloatRange( - default=None, - allow_none=True, - description="", - parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] + default=None, allow_none=True, description="", parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) clipvalue: Optional[float] = schema_utils.FloatRange( - default=None, - allow_none=True, - description="", - parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] + default=None, allow_none=True, description="", parameter_metadata=OPTIMIZER_METADATA["gradient_clipping"] ) From 3f31acf2147a0857b982f94f22ba547e5d0b41d5 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 23:10:52 -0700 Subject: [PATCH 06/22] Encoder expected impacts --- ludwig/schema/encoders/sequence_encoders.py | 6 +- ludwig/schema/metadata/configs/encoders.yaml | 521 ++++++++++++------- ludwig/schema/metadata/configs/features.yaml | 3 +- 3 files changed, 349 insertions(+), 181 deletions(-) diff --git a/ludwig/schema/encoders/sequence_encoders.py b/ludwig/schema/encoders/sequence_encoders.py index ab2ae41ecba..c244a4bb59e 100644 --- a/ludwig/schema/encoders/sequence_encoders.py +++ b/ludwig/schema/encoders/sequence_encoders.py @@ -168,7 +168,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=None, + default=1, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["ParallelCNN"]["num_conv_layers"], ) @@ -336,7 +336,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=None, + default=1, description="Number of parallel convolutional layers to 
use.", parameter_metadata=ENCODER_METADATA["StackedCNN"]["num_conv_layers"], ) @@ -1063,7 +1063,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=None, + default=1, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["StackedCNNRNN"]["num_conv_layers"], ) diff --git a/ludwig/schema/metadata/configs/encoders.yaml b/ludwig/schema/metadata/configs/encoders.yaml index c448f67e3bc..f66e61739e3 100644 --- a/ludwig/schema/metadata/configs/encoders.yaml +++ b/ludwig/schema/metadata/configs/encoders.yaml @@ -20,7 +20,7 @@ ALBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -35,6 +35,7 @@ ALBERT: bos_token_id: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: Beginning-of-Sentence Token Id + expected_impact: 1 classifier_dropout_prob: default_value_reasoning: Huggingface default. description_implications: @@ -46,7 +47,7 @@ ALBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -78,7 +79,7 @@ ALBERT: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -90,6 +91,7 @@ ALBERT: eos_token_id: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: End-of-Sentence Token Id + expected_impact: 1 hidden_act: default_value_reasoning: Taken from huggingface. description_implications: @@ -97,7 +99,7 @@ ALBERT: the feed-forward layers of the transformer. example_value: - relu - expected_impact: 2 + expected_impact: 1 literature_references: - "[Hugging face docs for ALBERT config](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertConfig.hidden_act)\n\ \r\n[Relevant StackOverflow discussion](https://ai.stackexchange.com/questions/30341/why-does-a-transformer-not-use-an-activation-function-following-the-multi-head-a)" @@ -118,7 +120,7 @@ ALBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -138,7 +140,7 @@ ALBERT: Increasing the hidden size makes the model larger and slower to train, increases the model's capacity to capture more complexity. It also increases the chance of overfitting. - expected_impact: 2 + expected_impact: 1 suggested_values: 10 - 2048 suggested_values_reasoning: Increasing the hidden size makes sense if the @@ -152,7 +154,7 @@ ALBERT: lead to the outputs of these matrices to vanish or explode example_value: - 0.02 - expected_impact: 3 + expected_impact: 1 other_information: Must be greater than 0 related_parameters: - weights_initializer @@ -163,10 +165,13 @@ ALBERT: ui_display_name: null inner_group_num: ui_display_name: null + expected_impact: 1 intermediate_size: ui_display_name: null + expected_impact: 1 layer_norm_eps: ui_display_name: null + expected_impact: 1 max_position_embeddings: default_value_reasoning: Taken from huggingface. 
description_implications: @@ -179,7 +184,7 @@ ALBERT: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality." - expected_impact: 2 + expected_impact: 1 suggested_values: 512 suggested_values_reasoning: Out of the box value based on published literature. @@ -190,18 +195,23 @@ ALBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null + expected_impact: 1 num_hidden_groups: ui_display_name: null + expected_impact: 1 num_hidden_layers: ui_display_name: null + expected_impact: 1 pad_token_id: ui_display_name: null + expected_impact: 1 position_embedding_type: ui_display_name: null + expected_impact: 1 pretrained_kwargs: default_value_reasoning: These arguments typically don't need to be specified. expected_impact: 1 @@ -241,6 +251,7 @@ ALBERT: ui_display_name: Pretrained model reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -248,7 +259,7 @@ ALBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -259,12 +270,14 @@ ALBERT: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null + expected_impact: 1 use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -294,16 +307,19 @@ AutoTransformer: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pretrained_kwargs: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null vocab: default_value_reasoning: @@ -471,7 +487,7 @@ BERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null @@ -485,8 +501,10 @@ BERT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -494,7 +512,7 @@ BERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -505,12 +523,13 @@ BERT: 2. the user doesn't have a lot of storage. 
ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -544,7 +563,7 @@ BagEmbedWeighted: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -620,7 +639,7 @@ BagEmbedWeighted: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -724,7 +743,7 @@ BagEmbedWeighted: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -801,7 +820,7 @@ BagEmbedWeighted: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -824,7 +843,7 @@ BagEmbedWeighted: model may have a head start in its representation of various input entities. example_value: - ~/Downloads/glove.6B.100d.txt - expected_impact: 2 + expected_impact: 0 related_parameters: - embedding_size, embeddings_trainable ui_display_name: Pretrained embeddings path @@ -895,7 +914,7 @@ BagEmbedWeighted: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -947,7 +966,7 @@ CTRL: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_ctx: ui_display_name: null @@ -963,8 +982,10 @@ CTRL: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 resid_pdrop: ui_display_name: null saved_weights_in_checkpoint: @@ -974,7 +995,7 @@ CTRL: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -985,10 +1006,11 @@ CTRL: 2. the user doesn't have a lot of storage. 
ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1152,7 +1174,7 @@ CamemBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null @@ -1166,8 +1188,10 @@ CamemBERT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -1175,7 +1199,7 @@ CamemBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -1186,12 +1210,13 @@ CamemBERT: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1274,7 +1299,7 @@ CategoricalEmbed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -1305,8 +1330,10 @@ CategoricalEmbed: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1315,7 +1342,6 @@ CategoricalEmbed: - a - b - c - expected_impact: 2 internal_only: true ui_display_name: Not Displayed CategoricalSparse: @@ -1386,7 +1412,7 @@ CategoricalSparse: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -1417,8 +1443,10 @@ CategoricalSparse: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 vocab: default_value_reasoning: Computed and passed along internally according to @@ -1447,7 +1475,7 @@ DateEmbed: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -1523,7 +1551,7 @@ DateEmbed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -1598,7 +1626,7 @@ DateEmbed: rate. 
example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -1675,7 +1703,7 @@ DateEmbed: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1730,7 +1758,7 @@ DateEmbed: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -1762,7 +1790,7 @@ DateWave: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -1864,7 +1892,7 @@ DateWave: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -1941,7 +1969,7 @@ DateWave: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1996,7 +2024,7 @@ DateWave: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -2026,7 +2054,7 @@ DenseEncoder: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -2083,6 +2111,7 @@ DenseEncoder: performance. ui_display_name: Dropout input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -2101,7 +2130,7 @@ DenseEncoder: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -2159,7 +2188,7 @@ DenseEncoder: due to overfitting." 
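To make the layer-count trade-off described above concrete, here is a minimal, hypothetical PyTorch sketch (not Ludwig's actual dense encoder code; the helper name and all sizes are made up) of how a num_layers-style knob typically turns into a stack of fully connected layers, where each extra layer adds capacity and overfitting risk:

    import torch.nn as nn

    def build_dense_stack(input_size: int, output_size: int, num_layers: int) -> nn.Sequential:
        # Each extra layer adds capacity (and overfitting risk); one layer is often
        # enough for a single numeric input feature.
        layers, in_features = [], input_size
        for _ in range(num_layers):
            layers.append(nn.Linear(in_features, output_size))
            layers.append(nn.ReLU())
            in_features = output_size
        return nn.Sequential(*layers)

    shallow = build_dense_stack(input_size=16, output_size=32, num_layers=1)
    deep = build_dense_stack(input_size=16, output_size=32, num_layers=4)
    print(sum(p.numel() for p in shallow.parameters()))  # 544
    print(sum(p.numel() for p in deep.parameters()))     # 3712: each added layer costs 32*32 + 32 parameters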
example_value: - 1 - expected_impact: 1 + expected_impact: 3 other_information: If you have multiple input features, varying the number of layers in the combiner or output feature decoder will have more impact. @@ -2180,7 +2209,7 @@ DenseEncoder: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -2211,7 +2240,7 @@ DenseEncoder: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -2243,7 +2272,7 @@ DistilBERT: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -2340,7 +2369,7 @@ DistilBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_heads: ui_display_name: null @@ -2350,6 +2379,7 @@ DistilBERT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 qa_dropout: default_value_reasoning: Huggingface default. description_implications: @@ -2375,6 +2405,7 @@ DistilBERT: ui_display_name: qa_dropout reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -2382,7 +2413,7 @@ DistilBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -2422,10 +2453,11 @@ DistilBERT: sinusoidal_pos_embds: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2518,7 +2550,7 @@ ELECTRA: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -2624,7 +2656,7 @@ ELECTRA: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. 
- expected_impact: 1 + internal_only: true ui_display_name: null num_attention_heads: ui_display_name: null @@ -2636,8 +2668,10 @@ ELECTRA: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -2645,7 +2679,7 @@ ELECTRA: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -2656,12 +2690,13 @@ ELECTRA: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null type_vocab_size: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2688,6 +2723,7 @@ FlauBERT: - https://arxiv.org/abs/1912.05372 asm: ui_display_name: null + expected_impact: 1 attention_dropout: default_value_reasoning: Huggingface default. description_implications: @@ -2699,7 +2735,7 @@ FlauBERT: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -2713,8 +2749,10 @@ FlauBERT: ui_display_name: attention_dropout bos_index: ui_display_name: null + expected_impact: 1 causal: ui_display_name: null + expected_impact: 1 dropout: default_value_reasoning: Huggingface default. description_implications: @@ -2740,27 +2778,38 @@ FlauBERT: ui_display_name: dropout emb_dim: ui_display_name: null + expected_impact: 1 embed_init_std: ui_display_name: null + expected_impact: 1 eos_index: ui_display_name: null + expected_impact: 1 gelu_activation: ui_display_name: null + expected_impact: 1 init_std: ui_display_name: null + expected_impact: 1 is_encoder: ui_display_name: null + expected_impact: 1 lang_id: ui_display_name: null + expected_impact: 1 layer_norm_eps: ui_display_name: null + expected_impact: 1 layerdrop: ui_display_name: null + expected_impact: 1 mask_index: ui_display_name: null + expected_impact: 1 mask_token_id: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: Mask Token ID + expected_impact: 1 max_position_embeddings: default_value_reasoning: Taken from huggingface. description_implications: @@ -2773,7 +2822,7 @@ FlauBERT: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality." - expected_impact: 2 + expected_impact: 1 suggested_values: 512 suggested_values_reasoning: Out of the box value based on published literature. @@ -2784,26 +2833,33 @@ FlauBERT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_head: ui_display_name: null + expected_impact: 1 n_langs: default_value_reasoning: Default value used in pre-trained HF encoder. 
expected_impact: 1 ui_display_name: Number of Languages n_layer: ui_display_name: null + expected_impact: 1 pad_index: ui_display_name: null + expected_impact: 1 pre_norm: ui_display_name: null + expected_impact: 1 pretrained_kwargs: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -2811,7 +2867,7 @@ FlauBERT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -2823,15 +2879,19 @@ FlauBERT: ui_display_name: null sinusoidal_embeddings: ui_display_name: null + expected_impact: 1 trainable: - expected_impact: 2 ui_display_name: null + expected_impact: 3 unk_index: ui_display_name: null + expected_impact: 1 use_lang_emb: ui_display_name: null + expected_impact: 1 use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2883,7 +2943,7 @@ GPT2: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_ctx: ui_display_name: null @@ -2901,17 +2961,20 @@ GPT2: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 resid_pdrop: ui_display_name: null scale_attn_weights: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -2963,7 +3026,7 @@ GPT: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_ctx: ui_display_name: null @@ -2979,8 +3042,10 @@ GPT: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 resid_pdrop: ui_display_name: null saved_weights_in_checkpoint: @@ -2990,7 +3055,7 @@ GPT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -3001,10 +3066,11 @@ GPT: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -3013,7 +3079,6 @@ GPT: - a - b - c - expected_impact: 2 internal_only: true ui_display_name: Not Displayed vocab_size: @@ -3037,7 +3102,7 @@ H3Embed: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -3113,7 +3178,7 @@ H3Embed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
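As a quick worked instance of the 1.6 * sqrt(vocab_size) sizing rule of thumb suggested repeatedly for these embedding_size parameters (the vocabulary size below is purely illustrative):

    import math

    vocab_size = 10_000                            # hypothetical vocabulary size
    embedding_size = 1.6 * math.sqrt(vocab_size)   # rule of thumb quoted in the metadata above
    print(round(embedding_size))                   # 160, a reasonable starting value to tune from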
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -3188,7 +3253,7 @@ H3Embed: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -3265,7 +3330,7 @@ H3Embed: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -3306,6 +3371,7 @@ H3Embed: ui_display_name: Sequence Reducer use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). description_implications: @@ -3321,7 +3387,7 @@ H3Embed: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -3347,6 +3413,7 @@ H3RNN: encoding the path in the tree of all H3 hexes. activation: ui_display_name: null + expected_impact: 1 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -3358,7 +3425,7 @@ H3RNN: constant value such as 0.01 for all biases to ensure that all ReLU units are activated in the beginning and have some effect on the gradient. However, it's still an open question as to whether this provides consistent improvement. - expected_impact: 1 + expected_impact: 2 literature_references: - https://cs231n.github.io/neural-networks-2/ related_parameters: @@ -3382,7 +3449,7 @@ H3RNN: Setting bidirectional to True may increase the compute and memory requirements of the model, but may also increase model performance on long sequences. - expected_impact: 3 + expected_impact: 0 literature_references: - https://devopedia.org/bidirectional-rnn#:~:text=RNN%20has%20the%20limitation%20that,forward%20and%20reverse%20time%20order. related_parameters: @@ -3404,7 +3471,7 @@ H3RNN: (1) compute costs and (2) catastrophic forgetting (source: https://en.wikipedia.org/wiki/Catastrophic_interference ). RNNs have marginally less compute costs, but are prone to catastrophic forgetting." - expected_impact: 1 + expected_impact: 3 related_parameters: - "bidirectional @@ -3460,7 +3527,7 @@ H3RNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -3514,7 +3581,7 @@ H3RNN: performance for longer sequences or more complex tasks. 
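A small, hedged PyTorch sketch (illustrative only, not the H3RNN implementation; all sizes are invented) of how the cell type, number of recurrent layers, and bidirectionality discussed above drive model size and output width:

    import torch
    import torch.nn as nn

    def param_count(module: nn.Module) -> int:
        return sum(p.numel() for p in module.parameters())

    rnn = nn.RNN(input_size=32, hidden_size=64, num_layers=1)     # cheapest cell, prone to forgetting
    lstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=1)   # more parameters, less forgetting
    deep_bilstm = nn.LSTM(input_size=32, hidden_size=64, num_layers=2, bidirectional=True)

    print(param_count(rnn), param_count(lstm), param_count(deep_bilstm))

    out, _ = deep_bilstm(torch.randn(10, 4, 32))   # (sequence length, batch, features)
    print(out.shape)                               # torch.Size([10, 4, 128]): bidirectionality doubles the output width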
example_value: - 1 - expected_impact: 1 + expected_impact: 3 other_information: If you have multiple input features, varying the number of layers in the combiner or output feature decoder will have more impact. @@ -3528,7 +3595,7 @@ H3RNN: ui_display_name: Number of Recurrent Layers recurrent_activation: default_value_reasoning: sigmoid' is commonly used - expected_impact: 3 + expected_impact: 1 other_information: I don't think that this parameter is used anywhere in the code base. It's being passed down but not used in the actual RNN forwarding @@ -3549,7 +3616,7 @@ H3RNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 2 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -3563,10 +3630,13 @@ H3RNN: ui_display_name: Recurrent Dropout recurrent_initializer: ui_display_name: null + expected_impact: 1 reduce_output: ui_display_name: null + expected_impact: 1 unit_forget_bias: ui_display_name: null + expected_impact: 1 use_bias: ui_display_name: null weights_initializer: @@ -3584,7 +3654,7 @@ H3RNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -3616,7 +3686,7 @@ H3WeightedSum: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -3692,7 +3762,7 @@ H3WeightedSum: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -3767,7 +3837,7 @@ H3WeightedSum: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -3844,7 +3914,7 @@ H3WeightedSum: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -3860,8 +3930,10 @@ H3WeightedSum: ui_display_name: Output Size should_softmax: ui_display_name: null + expected_impact: 1 use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). description_implications: @@ -3877,7 +3949,7 @@ H3WeightedSum: provides a few good options. 
See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -3907,7 +3979,7 @@ Longformer: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_tokens: ui_display_name: null @@ -3915,8 +3987,10 @@ Longformer: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -3924,7 +3998,7 @@ Longformer: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -3937,10 +4011,11 @@ Longformer: sep_token_id: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -4008,7 +4083,7 @@ MLPMixer: performance for larger images or more complex image tasks. example_value: - 8 - expected_impact: 1 + expected_impact: 3 literature_references: - "MLP-Mixer: An all-MLP Architecture for Vision @@ -4102,7 +4177,7 @@ MT5: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_decoder_layers: ui_display_name: null @@ -4121,7 +4196,7 @@ MT5: from the pre-trained model." example_value: - 8 - expected_impact: 1 + expected_impact: 3 related_parameters: - pretrained_model_or_path suggested_values: 1 - 12 @@ -4136,8 +4211,10 @@ MT5: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 relative_attention_num_buckets: ui_display_name: null saved_weights_in_checkpoint: @@ -4147,7 +4224,7 @@ MT5: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -4187,12 +4264,13 @@ MT5: tokenizer_class: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_cache: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -4228,7 +4306,7 @@ ParallelCNN: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -4320,7 +4398,7 @@ ParallelCNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -4351,6 +4429,7 @@ ParallelCNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -4387,11 +4466,12 @@ ParallelCNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -4405,7 +4485,7 @@ ParallelCNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -4453,7 +4533,7 @@ ParallelCNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -4495,7 +4575,7 @@ ParallelCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -4511,14 +4591,19 @@ ParallelCNN: ui_display_name: Output Size pool_function: ui_display_name: Pooling function + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -4574,7 +4659,7 @@ ParallelCNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -4597,6 +4682,7 @@ PassthroughEncoder: placeholders as outputs. Inputs are of size `b` while outputs are of size `b x 1` where `b` is the batch size. input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -4610,6 +4696,7 @@ BinaryPassthroughEncoder: placeholders as outputs. Inputs are of size `b` while outputs are of size `b x 1` where `b` is the batch size. input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -4623,6 +4710,7 @@ CategoricalPassthroughEncoder: placeholders as outputs. Inputs are of size `b` while outputs are of size `b x 1` where `b` is the batch size. 
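The shape contract described for these passthrough encoders can be shown with a tiny tensor sketch (this is only the `b` to `b x 1` reshape, not the actual encoder classes):

    import torch

    b = 4                         # batch size
    x = torch.randn(b)            # raw inputs of size b
    out = x.unsqueeze(-1)         # passthrough output of size b x 1
    assert out.shape == (b, 1)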
input_size: + internal_only: true other_information: Internal Only related_parameters: - "No" @@ -4644,7 +4732,7 @@ ResNet: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -4742,8 +4830,10 @@ ResNet: ui_display_name: Fully Connected Layers first_pool_kernel_size: ui_display_name: null + expected_impact: 1 first_pool_stride: ui_display_name: null + expected_impact: 1 height: internal_only: true ui_display_name: null @@ -4761,7 +4851,7 @@ ResNet: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -4842,7 +4932,7 @@ ResNet: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -4860,6 +4950,7 @@ ResNet: ui_display_name: null use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). description_implications: @@ -4875,7 +4966,7 @@ ResNet: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -4916,7 +5007,7 @@ RoBERTa: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pad_token_id: ui_display_name: null @@ -4924,8 +5015,10 @@ RoBERTa: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -4933,7 +5026,7 @@ RoBERTa: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -4944,10 +5037,11 @@ RoBERTa: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -5015,7 +5109,7 @@ SequenceEmbed: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -5046,18 +5140,22 @@ SequenceEmbed: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 vocab: default_value_reasoning: Computed and passed along internally according to @@ -5084,7 +5182,7 @@ SequenceEmbed: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -5125,10 +5223,11 @@ SequencePassthrough: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null reduce_output: ui_display_name: null + expected_impact: 1 SetSparseEncoder: type: short_description: @@ -5148,7 +5247,7 @@ SetSparseEncoder: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -5224,7 +5323,7 @@ SetSparseEncoder: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -5255,6 +5354,7 @@ SetSparseEncoder: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -5301,7 +5401,7 @@ SetSparseEncoder: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -5378,7 +5478,7 @@ SetSparseEncoder: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -5394,8 +5494,10 @@ SetSparseEncoder: ui_display_name: Output Size pretrained_embeddings: ui_display_name: null + expected_impact: 0 representation: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -5448,7 +5550,7 @@ SetSparseEncoder: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -5590,7 +5692,7 @@ Stacked2DCNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -5695,7 +5797,7 @@ Stacked2DCNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -5738,7 +5840,7 @@ Stacked2DCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -5846,7 +5948,7 @@ StackedCNN: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -5957,7 +6059,7 @@ StackedCNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -5988,6 +6090,7 @@ StackedCNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -6024,11 +6127,12 @@ StackedCNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -6042,7 +6146,7 @@ StackedCNN: rate. 
example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -6090,7 +6194,7 @@ StackedCNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -6132,7 +6236,7 @@ StackedCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -6150,18 +6254,25 @@ StackedCNN: ui_display_name: null pool_function: ui_display_name: null + expected_impact: 1 pool_padding: ui_display_name: null + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pool_strides: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -6202,6 +6313,7 @@ StackedCNN: ui_display_name: Stride use_bias: ui_display_name: null + expected_impact: 1 vocab: default_value_reasoning: Computed and passed along internally according to @@ -6210,7 +6322,7 @@ StackedCNN: - a - b - c - expected_impact: 2 + expected_impact: 1 internal_only: true ui_display_name: Not Displayed weights_initializer: @@ -6228,7 +6340,7 @@ StackedCNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -6254,6 +6366,7 @@ StackedCNNRNN: last output, but can perform other reduce functions. activation: ui_display_name: null + expected_impact: 2 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -6283,10 +6396,13 @@ StackedCNNRNN: ui_display_name: Bias Initializer bidirectional: ui_display_name: null + expected_impact: 0 cell_type: ui_display_name: null + expected_impact: 3 conv_activation: ui_display_name: null + expected_impact: 1 conv_dropout: default_value_reasoning: Dropout can cause training to become less stable. @@ -6406,7 +6522,7 @@ StackedCNNRNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' 
- expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -6437,6 +6553,7 @@ StackedCNNRNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -6472,7 +6589,7 @@ StackedCNNRNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -6526,11 +6643,12 @@ StackedCNNRNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -6544,7 +6662,7 @@ StackedCNNRNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -6592,7 +6710,7 @@ StackedCNNRNN: achieve better performance when a large amount of data is provided, but also makes the model more computationally expensive and potentially more prone to overfitting. - expected_impact: 2 + expected_impact: 3 related_parameters: - conv_layers ui_display_name: Number of Convolutional Layers @@ -6636,7 +6754,7 @@ StackedCNNRNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -6654,17 +6772,22 @@ StackedCNNRNN: ui_display_name: null pool_function: ui_display_name: null + expected_impact: 1 pool_padding: ui_display_name: null + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pool_strides: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 recurrent_activation: default_value_reasoning: sigmoid' is commonly used - expected_impact: 3 + expected_impact: 1 other_information: I don't think that this parameter is used anywhere in the code base. It's being passed down but not used in the actual RNN forwarding @@ -6685,7 +6808,7 @@ StackedCNNRNN: \ generalization." 
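For the dropout parameters discussed throughout these entries, a tiny PyTorch sketch of the mechanism (purely illustrative; the 0.2 rate mirrors the example value used above):

    import torch
    import torch.nn as nn

    drop = nn.Dropout(p=0.2)      # each activation is zeroed with probability 0.2 while training
    x = torch.ones(8)

    drop.train()
    print(drop(x))                # some entries are 0, the survivors are scaled by 1 / (1 - 0.2)

    drop.eval()
    print(drop(x))                # dropout is a no-op at evaluation time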
example_value: - 0.2 - expected_impact: 3 + expected_impact: 2 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -6705,15 +6828,19 @@ StackedCNNRNN: ui_display_name: Recurrent Dropout recurrent_initializer: ui_display_name: null + expected_impact: 1 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed state_size: ui_display_name: null + expected_impact: 3 strides: default_value_reasoning: In general, it makes sense to have a smaller stride @@ -6751,6 +6878,7 @@ StackedCNNRNN: ui_display_name: Stride unit_forget_bias: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -6803,7 +6931,7 @@ StackedCNNRNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -6839,7 +6967,7 @@ StackedParallelCNN: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -6915,7 +7043,7 @@ StackedParallelCNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -6946,6 +7074,7 @@ StackedParallelCNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -6982,11 +7111,12 @@ StackedParallelCNN: ui_display_name: Fully Connected Layers filter_size: ui_display_name: null + expected_impact: 2 max_sequence_length: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -7000,7 +7130,7 @@ StackedParallelCNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -7089,7 +7219,7 @@ StackedParallelCNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -7105,14 +7235,19 @@ StackedParallelCNN: ui_display_name: Output Size pool_function: ui_display_name: null + expected_impact: 1 pool_size: ui_display_name: null + expected_impact: 1 pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -7170,7 +7305,7 @@ StackedParallelCNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -7195,6 +7330,7 @@ StackedRNN: operation that by default only returns the last output, but can perform other reduce functions. activation: ui_display_name: null + expected_impact: 2 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -7224,8 +7360,10 @@ StackedRNN: ui_display_name: Bias Initializer bidirectional: ui_display_name: null + expected_impact: 0 cell_type: ui_display_name: null + expected_impact: 3 dropout: default_value_reasoning: Dropout can cause training to become less stable. @@ -7276,7 +7414,7 @@ StackedRNN: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -7307,6 +7445,7 @@ StackedRNN: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -7342,7 +7481,7 @@ StackedRNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -7392,7 +7531,7 @@ StackedRNN: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -7406,7 +7545,7 @@ StackedRNN: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -7483,7 +7622,7 @@ StackedRNN: performance for longer sequences or more complex tasks. example_value: - 1 - expected_impact: 1 + expected_impact: 3 suggested_values: 1-3 suggested_values_reasoning: Increasing the number of layers may improve encoder @@ -7499,7 +7638,7 @@ StackedRNN: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
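A brief, hypothetical sketch of the weights_initializer guidance above (plain PyTorch init calls on a stand-in layer; Ludwig's own initializer plumbing is not shown):

    import torch.nn as nn

    layer = nn.Linear(256, 256)

    nn.init.xavier_uniform_(layer.weight)      # the Glorot/Xavier scheme referenced above
    nn.init.zeros_(layer.bias)                 # biases are commonly just zeroed

    # Poor choices make the difference obvious:
    # nn.init.zeros_(layer.weight)             # every unit starts identical and learns the same thing
    # nn.init.normal_(layer.weight, std=5.0)   # overly large values can saturate activations early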
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -7515,9 +7654,10 @@ StackedRNN: ui_display_name: Output Size pretrained_embeddings: ui_display_name: null + expected_impact: 0 recurrent_activation: default_value_reasoning: sigmoid' is commonly used - expected_impact: 3 + expected_impact: 1 other_information: I don't think that this parameter is used anywhere in the code base. It's being passed down but not used in the actual RNN forwarding @@ -7538,7 +7678,7 @@ StackedRNN: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 2 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -7556,17 +7696,22 @@ StackedRNN: ui_display_name: Recurrent Dropout recurrent_initializer: ui_display_name: null + expected_impact: 1 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed state_size: ui_display_name: null + expected_impact: 3 unit_forget_bias: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -7619,7 +7764,7 @@ StackedRNN: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -7713,7 +7858,7 @@ StackedTransformer: Increasing the embedding size may cause the model to train more slowly, but the higher dimensionality can also improve overall quality.' - expected_impact: 2 + expected_impact: 3 literature_references: - https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture suggested_values: 1.6 * sqrt(vocab_size) @@ -7744,6 +7889,7 @@ StackedTransformer: ui_display_name: Embeddings on CPU embeddings_trainable: ui_display_name: null + expected_impact: 1 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -7779,7 +7925,7 @@ StackedTransformer: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html related_parameters: @@ -7844,7 +7990,7 @@ StackedTransformer: Sets the maximum sequence length of the expected inputs, so input/output shapes and the positional embedding matrix are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null norm: default_value_reasoning: @@ -7858,7 +8004,7 @@ StackedTransformer: rate. example_value: - batch - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/batch-normalization-for-training-of-deep-neural-networks/ related_parameters: @@ -7940,7 +8086,7 @@ StackedTransformer: while providing diminishing returns of model performance." 
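To make the capacity-versus-diminishing-returns point above tangible, a small hypothetical sketch using PyTorch's generic transformer encoder (not Ludwig's StackedTransformer; d_model and nhead are arbitrary) showing that parameter count grows roughly linearly with the number of layers:

    import torch.nn as nn

    def transformer_params(num_layers: int) -> int:
        layer = nn.TransformerEncoderLayer(d_model=64, nhead=4)
        encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        return sum(p.numel() for p in encoder.parameters())

    for n in (1, 3, 6):
        print(n, transformer_params(n))   # parameters grow ~linearly with depth; quality gains usually do not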
example_value: - 1 - expected_impact: 1 + expected_impact: 3 suggested_values: 1 - 12 suggested_values_reasoning: Increasing the number of layers may improve encoder @@ -7956,7 +8102,7 @@ StackedTransformer: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -7972,10 +8118,13 @@ StackedTransformer: ui_display_name: Output Size pretrained_embeddings: ui_display_name: null + expected_impact: 0 reduce_output: ui_display_name: null + expected_impact: 1 representation: ui_display_name: null + expected_impact: 1 should_embed: internal_only: true ui_display_name: Not displayed @@ -8054,7 +8203,7 @@ StackedTransformer: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster. @@ -8118,7 +8267,7 @@ T5: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null num_decoder_layers: ui_display_name: null @@ -8137,7 +8286,7 @@ T5: from the pre-trained model." example_value: - 6 - expected_impact: 1 + expected_impact: 3 related_parameters: - pretrained_model_or_path suggested_values: 1 - 12 @@ -8150,8 +8299,10 @@ T5: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 relative_attention_num_buckets: ui_display_name: null saved_weights_in_checkpoint: @@ -8161,7 +8312,7 @@ T5: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8172,10 +8323,11 @@ T5: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -8264,7 +8416,7 @@ TransformerXL: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null mem_len: ui_display_name: null @@ -8278,12 +8430,14 @@ TransformerXL: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 proj_init_std: ui_display_name: null proj_share_all_but_first: ui_display_name: null reduce_output: ui_display_name: null + expected_impact: 1 same_length: ui_display_name: null sample_softmax: @@ -8295,7 +8449,7 @@ TransformerXL: description_implications: The memory footprint for some of these encoders can be large. 
- expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8306,12 +8460,13 @@ TransformerXL: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null untie_r: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -8341,7 +8496,7 @@ TVBaseEncoder: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8358,7 +8513,7 @@ TVBaseEncoder: and flexibility. If False, less weights are subject to change and the model will therefore train faster. However, the representations output by this component are fixed for each input. - expected_impact: 2 + expected_impact: 3 literature_references: - "https://www.ibm.com/cloud/learn/overfitting @@ -8382,7 +8537,7 @@ TVBaseEncoder: Pretrained models have typically already learned features that are difficult to learn from scratch. They are particularly beneficial when training on small amounts of data. - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/transfer-learning-for-deep-learning/ related_parameters: @@ -8662,7 +8817,7 @@ ViT: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8679,7 +8834,7 @@ ViT: and flexibility. If False, less weights are subject to change and the model will therefore train faster. However, the representations output by this component are fixed for each input. - expected_impact: 2 + expected_impact: 3 literature_references: - "https://www.ibm.com/cloud/learn/overfitting @@ -8703,7 +8858,7 @@ ViT: Pretrained models have typically already learned features that are difficult to learn from scratch. They are particularly beneficial when training on small amounts of data. - expected_impact: 3 + expected_impact: 2 literature_references: - https://machinelearningmastery.com/transfer-learning-for-deep-learning/ related_parameters: @@ -8790,6 +8945,7 @@ XLM: ui_display_name: null gelu_activation: ui_display_name: null + expected_impact: 1 init_std: ui_display_name: null is_encoder: @@ -8826,7 +8982,7 @@ XLM: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null n_heads: ui_display_name: null @@ -8844,8 +9000,10 @@ XLM: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -8853,7 +9011,7 @@ XLM: description_implications: The memory footprint for some of these encoders can be large. 
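A pattern repeated across these encoder hunks is swapping expected_impact: 1 for internal_only: true on fields the toolkit appears to fill in itself (the maximum-sequence-length entries, saved_weights_in_checkpoint). Assuming internal_only means the parameter is withheld from user-facing impact grading rather than ranked, the flagged fields for a given encoder can be listed with plain PyYAML (illustrative sketch, not Ludwig's loader):

    # Sketch: list which TransformerXL metadata entries are now internal_only,
    # per the hunks above. Parameter names are read from the file, not assumed.
    import yaml

    with open("ludwig/schema/metadata/configs/encoders.yaml") as f:
        meta = yaml.safe_load(f)

    hidden = [name for name, entry in meta["TransformerXL"].items()
              if isinstance(entry, dict) and entry.get("internal_only")]
    print(hidden)  # fields switched from expected_impact to internal_only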
- expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8868,7 +9026,7 @@ XLM: start_n_top: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null unk_index: ui_display_name: null @@ -8876,6 +9034,7 @@ XLM: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -8911,7 +9070,7 @@ XLMRoBERTa: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null pad_token_id: ui_display_name: null @@ -8919,8 +9078,10 @@ XLMRoBERTa: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 saved_weights_in_checkpoint: default_value_reasoning: The weights of the encoder are not necessarily saved @@ -8928,7 +9089,7 @@ XLMRoBERTa: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -8939,10 +9100,11 @@ XLMRoBERTa: 2. the user doesn't have a lot of storage. ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to @@ -9010,6 +9172,7 @@ XLNet: ui_display_name: End-of-Sequence Token Id ff_activation: ui_display_name: null + expected_impact: 1 initializer_range: description_implications: There is an ideal value for this variable that doesn't @@ -9031,7 +9194,7 @@ XLNet: default_value_reasoning: Sets the maximum sequence length of the expected inputs, so input/output shapes are computed accurately. - expected_impact: 1 + internal_only: true ui_display_name: null mem_len: ui_display_name: null @@ -9045,8 +9208,10 @@ XLNet: ui_display_name: null pretrained_model_name_or_path: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 reuse_len: ui_display_name: null same_length: @@ -9058,7 +9223,7 @@ XLNet: description_implications: The memory footprint for some of these encoders can be large. - expected_impact: 1 + internal_only: true related_parameters: - skip_save_model suggested_values: @@ -9073,6 +9238,7 @@ XLNet: summary_activation: default_value_reasoning: Default value used in pre-trained HF encoder. ui_display_name: Summary Activation Function + expected_impact: 1 summary_last_dropout: default_value_reasoning: Huggingface default. 
description_implications: @@ -9101,7 +9267,7 @@ XLNet: summary_use_proj: ui_display_name: null trainable: - expected_impact: 2 + expected_impact: 3 ui_display_name: null untie_r: ui_display_name: null @@ -9111,6 +9277,7 @@ XLNet: ui_display_name: null use_pretrained: ui_display_name: null + expected_impact: 2 vocab: default_value_reasoning: Computed and passed along internally according to diff --git a/ludwig/schema/metadata/configs/features.yaml b/ludwig/schema/metadata/configs/features.yaml index db686e09ce4..9674fb97bb2 100644 --- a/ludwig/schema/metadata/configs/features.yaml +++ b/ludwig/schema/metadata/configs/features.yaml @@ -570,7 +570,7 @@ set: may perform worse when rare tokens appear in the data example_value: - 10000 - expected_impact: 3 + expected_impact: 2 other_information: Specifying a vocab_file overrides this parameter related_parameters: - vocab_file, pretrained_embeddings @@ -585,6 +585,7 @@ set: ui_display_name: Most common (vocabulary size) tokenizer: ui_display_name: null + expected_impact: 3 text: preprocessing: computed_fill_value: From 94b34ad3011f2ecdaf8ee7d92e09477dafaee6c0 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Wed, 18 Jan 2023 23:27:54 -0700 Subject: [PATCH 07/22] Combiners and Decoders --- ludwig/schema/metadata/configs/combiners.yaml | 64 ++++++++++++++----- ludwig/schema/metadata/configs/decoders.yaml | 29 ++++++--- ludwig/schema/metadata/configs/encoders.yaml | 9 +++ 3 files changed, 77 insertions(+), 25 deletions(-) diff --git a/ludwig/schema/metadata/configs/combiners.yaml b/ludwig/schema/metadata/configs/combiners.yaml index fc5ffaf86a4..4f9f6307f3c 100644 --- a/ludwig/schema/metadata/configs/combiners.yaml +++ b/ludwig/schema/metadata/configs/combiners.yaml @@ -20,7 +20,7 @@ ComparatorCombiner: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -80,10 +80,12 @@ ComparatorCombiner: literature_references: - https://ludwig.ai/0.6/configuration/combiner/#comparator-combiner ui_display_name: Entity 1 + expected_impact: 3 entity_2: literature_references: - https://ludwig.ai/0.6/configuration/combiner/#comparator-combiner ui_display_name: Entity 2 + expected_impact: 3 fc_layers: default_value_reasoning: By default the stack is built by using num_fc_layers, @@ -145,6 +147,7 @@ ComparatorCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -181,7 +184,7 @@ ComparatorCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -235,7 +238,7 @@ ComparatorCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. 
- expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -269,7 +272,7 @@ ConcatCombiner: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -361,6 +364,7 @@ ConcatCombiner: ui_display_name: Fully Connected Layers flatten_inputs: ui_display_name: null + expected_impact: 1 norm: default_value_reasoning: While batch normalization and layer normalization @@ -388,6 +392,7 @@ ConcatCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -424,7 +429,7 @@ ConcatCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -440,6 +445,7 @@ ConcatCombiner: ui_display_name: Output Size residual: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -480,7 +486,7 @@ ConcatCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -508,7 +514,7 @@ ProjectAggregateCombiner: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -625,6 +631,7 @@ ProjectAggregateCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -661,7 +668,7 @@ ProjectAggregateCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -677,8 +684,10 @@ ProjectAggregateCombiner: ui_display_name: Output Size projection_size: ui_display_name: null + expected_impact: 1 residual: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -719,7 +728,7 @@ ProjectAggregateCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -745,10 +754,13 @@ SequenceCombiner: the outputs for the sequence encoders also apply to the sequence combiner. encoder: ui_display_name: null + expected_impact: 3 main_sequence_feature: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 SequenceConcatCombiner: type: short_description: @@ -774,8 +786,10 @@ SequenceConcatCombiner: the concatenation of the h dimensions of all input features. main_sequence_feature: ui_display_name: null + expected_impact: 3 reduce_output: ui_display_name: null + expected_impact: 1 TabNetCombiner: type: short_description: @@ -831,6 +845,7 @@ TabNetCombiner: where x_hat is the estimated statistic and x_t is the new observed value." suggested_values: 0.01-0.2 ui_display_name: Batch Norm Momentum + expected_impact: 1 bn_virtual_bs: default_value_reasoning: Paper default. description_implications: @@ -846,7 +861,7 @@ TabNetCombiner: of data, so the authors use it only in the generator network. A higher virtual batch size could improve normalization, but it also causes training to run slower since each batch will be sampled multiple times. - expected_impact: 2 + expected_impact: 1 literature_references: - https://paperswithcode.com/method/virtual-batch-normalization ui_display_name: "Ghost Normalization: Virtual batch size" @@ -873,14 +888,19 @@ TabNetCombiner: ui_display_name: Dropout entmax_alpha: ui_display_name: null + expected_impact: 1 entmax_mode: ui_display_name: null + expected_impact: 1 num_shared_blocks: ui_display_name: null + expected_impact: 1 num_steps: ui_display_name: null + expected_impact: 1 num_total_blocks: ui_display_name: null + expected_impact: 1 output_size: default_value_reasoning: A modest value, not too small, not too large. description_implications: @@ -890,7 +910,7 @@ TabNetCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -906,10 +926,13 @@ TabNetCombiner: ui_display_name: Output Size relaxation_factor: ui_display_name: null + expected_impact: 1 size: ui_display_name: null + expected_impact: 3 sparsity: ui_display_name: null + expected_impact: 1 TabTransformerCombiner: type: short_description: @@ -1005,6 +1028,7 @@ TabTransformerCombiner: related_parameters: - hidden_size ui_display_name: Embed Input Feature Name + expected_impact: 3 fc_activation: default_value_reasoning: The Rectified Linear Units (ReLU) function is the @@ -1040,7 +1064,7 @@ TabTransformerCombiner: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html suggested_values: 0.05 - 0.8 @@ -1077,6 +1101,7 @@ TabTransformerCombiner: ui_display_name: Fully Connected Layers fc_residual: ui_display_name: null + expected_impact: 1 hidden_size: default_value_reasoning: Not too big, not too small. description_implications: @@ -1118,6 +1143,7 @@ TabTransformerCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -1193,7 +1219,7 @@ TabTransformerCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1209,6 +1235,7 @@ TabTransformerCombiner: ui_display_name: Output Size reduce_output: ui_display_name: null + expected_impact: 1 transformer_output_size: default_value_reasoning: A modest value, not too small, not too large. description_implications: @@ -1272,7 +1299,7 @@ TabTransformerCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -1384,7 +1411,7 @@ TransformerCombiner: \ generalization." example_value: - 0.2 - expected_impact: 3 + expected_impact: 1 literature_references: - https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html suggested_values: 0.05 - 0.8 @@ -1462,6 +1489,7 @@ TransformerCombiner: ui_display_name: Normalization Type norm_params: ui_display_name: null + expected_impact: 1 num_fc_layers: default_value_reasoning: The encoder already has learnable parameters.Sometimes @@ -1491,6 +1519,7 @@ TransformerCombiner: ui_display_name: Number of Fully Connected Layers num_heads: ui_display_name: null + expected_impact: 1 num_layers: default_value_reasoning: The ideal number of layers depends on the data. 
For @@ -1521,7 +1550,7 @@ TransformerCombiner: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. - expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -1537,6 +1566,7 @@ TransformerCombiner: ui_display_name: Output Size reduce_output: ui_display_name: null + expected_impact: 1 transformer_output_size: default_value_reasoning: A modest value, not too small, not too large. description_implications: @@ -1600,7 +1630,7 @@ TransformerCombiner: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" diff --git a/ludwig/schema/metadata/configs/decoders.yaml b/ludwig/schema/metadata/configs/decoders.yaml index e17b0effc9a..f2a70b9a68b 100644 --- a/ludwig/schema/metadata/configs/decoders.yaml +++ b/ludwig/schema/metadata/configs/decoders.yaml @@ -34,12 +34,14 @@ Classifier: ui_display_name: Bias Initializer input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed num_classes: other_information: Internal Only ui_display_name: Not Displayed + expected_impact: 3 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -80,7 +82,7 @@ Classifier: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -105,7 +107,7 @@ Projector: Changing the activation functions has an impact on the computational load of the model and might require further hypterparameter tuning - expected_impact: 1 + expected_impact: 2 suggested_values: The default value will work well in the majority of the cases @@ -139,8 +141,10 @@ Projector: ui_display_name: Bias Initializer clip: ui_display_name: null + expected_impact: 1 input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed @@ -153,7 +157,7 @@ Projector: and there's a higher risk of overfitting. If it seems like the model could use even more capacity, consider increasing the number of fully connected layers, or explore other architectures. 
- expected_impact: 2 + expected_impact: 3 other_information: If num_fc_layers=0 and fc_layers=None, and there are no fully connected layers defined on the module, then this parameter may @@ -207,7 +211,7 @@ Projector: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -228,6 +232,7 @@ Regressor: projection to a single number. activation: ui_display_name: null + expected_impact: 2 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -257,6 +262,7 @@ Regressor: ui_display_name: Bias Initializer input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed @@ -300,7 +306,7 @@ Regressor: provides a few good options. See this nice discussion from [Weights and Biases](https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster.) for more information. - expected_impact: 3 + expected_impact: 1 literature_references: - "Weights and Biases blog post: https://wandb.ai/site/articles/the-effects-of-weight-initialization-on-neural-nets#:~:text=Studies%20have%20shown%20that%20initializing,net%20train%20better%20and%20faster." - "Xavier et al. paper: http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf" @@ -320,6 +326,7 @@ PassthroughDecoder: The passthrough decoder simply returns the raw output coming from the combiner. input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed @@ -344,13 +351,15 @@ SequenceGeneratorDecoder: during model building. cell_type: ui_display_name: null + expected_impact: 3 input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed max_sequence_length: - expected_impact: 1 + expected_impact: 3 ui_display_name: null num_layers: default_value_reasoning: @@ -361,7 +370,7 @@ SequenceGeneratorDecoder: performance for longer sequences or more complex tasks. 
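For a sense of how the decoder parameters graded above surface to users: they are set on an output feature's decoder block in a Ludwig config. A hedged sketch follows — the feature names are invented and the nesting assumes the 0.6-style config this schema describes:

    # Hypothetical config exercising the generator decoder parameters whose
    # metadata is re-graded above (cell_type, num_layers). Names are made up.
    config = {
        "input_features": [
            {"name": "document", "type": "text", "encoder": {"type": "rnn"}},
        ],
        "output_features": [
            {
                "name": "summary",
                "type": "text",
                "decoder": {"type": "generator", "cell_type": "lstm", "num_layers": 2},
            },
        ],
    }

Passing a dict like this to ludwig.api.LudwigModel should build a text-to-text model whose generator decoder stacks two LSTM layers, which is the knob the description above discusses.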
example_value: - 1 - expected_impact: 1 + expected_impact: 3 suggested_values: 1-3 suggested_values_reasoning: Increasing the number of layers may improve encoder @@ -410,16 +419,19 @@ SequenceTaggerDecoder: ui_display_name: Attention Embedding Size attention_num_heads: ui_display_name: null + expected_impact: 1 input_size: other_information: Internal Only + internal_only: true related_parameters: - "No" ui_display_name: Not Displayed max_sequence_length: - expected_impact: 1 + expected_impact: 3 ui_display_name: null use_attention: ui_display_name: null + expected_impact: 1 use_bias: default_value_reasoning: "Bias terms may improve model accuracy, and don't @@ -448,3 +460,4 @@ SequenceTaggerDecoder: ui_display_name: Use Bias vocab_size: ui_display_name: Not displayed + internal_only: true diff --git a/ludwig/schema/metadata/configs/encoders.yaml b/ludwig/schema/metadata/configs/encoders.yaml index f66e61739e3..a3072c18d46 100644 --- a/ludwig/schema/metadata/configs/encoders.yaml +++ b/ludwig/schema/metadata/configs/encoders.yaml @@ -394,6 +394,7 @@ BERT: ui_display_name: classifier_dropout gradient_checkpointing: ui_display_name: null + expected_impact: 1 hidden_act: default_value_reasoning: Taken from huggingface. description_implications: @@ -462,8 +463,10 @@ BERT: ui_display_name: null intermediate_size: ui_display_name: null + expected_impact: 1 layer_norm_eps: ui_display_name: null + expected_impact: 1 max_position_embeddings: default_value_reasoning: Taken from huggingface. description_implications: @@ -491,14 +494,19 @@ BERT: ui_display_name: null num_attention_heads: ui_display_name: null + expected_impact: 1 num_hidden_layers: ui_display_name: null + expected_impact: 1 pad_token_id: ui_display_name: null + expected_impact: 1 position_embedding_type: ui_display_name: null + expected_impact: 1 pretrained_kwargs: ui_display_name: null + expected_impact: 1 pretrained_model_name_or_path: ui_display_name: null expected_impact: 3 @@ -527,6 +535,7 @@ BERT: ui_display_name: null type_vocab_size: ui_display_name: null + expected_impact: 1 use_pretrained: ui_display_name: null expected_impact: 2 From ff40d63c8046e78ed8888c64b53df9431ade3661 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 00:09:13 -0700 Subject: [PATCH 08/22] Fix parameter metadata not showing up for some params --- ludwig/schema/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ludwig/schema/utils.py b/ludwig/schema/utils.py index d95cf096527..7bd66b7cec5 100644 --- a/ludwig/schema/utils.py +++ b/ludwig/schema/utils.py @@ -254,9 +254,12 @@ def String( allow_none=allow_none, load_default=default, dump_default=default, - metadata={"description": description}, + metadata={ + "description": description, + "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None + }, ), - "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, + # "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, }, default=default, ) From 195624d896b8ea8d3604cea3889afc18a0efe6f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 07:10:07 +0000 Subject: [PATCH 09/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/schema/utils.py 
b/ludwig/schema/utils.py index 7bd66b7cec5..17238e70ea5 100644 --- a/ludwig/schema/utils.py +++ b/ludwig/schema/utils.py @@ -256,7 +256,7 @@ def String( dump_default=default, metadata={ "description": description, - "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None + "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, }, ), # "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, From 1ea2a91e778b64ce5003ef0f468b60992201789a Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 08:35:03 -0700 Subject: [PATCH 10/22] fix --- ludwig/schema/encoders/sequence_encoders.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ludwig/schema/encoders/sequence_encoders.py b/ludwig/schema/encoders/sequence_encoders.py index c244a4bb59e..ab2ae41ecba 100644 --- a/ludwig/schema/encoders/sequence_encoders.py +++ b/ludwig/schema/encoders/sequence_encoders.py @@ -168,7 +168,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=1, + default=None, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["ParallelCNN"]["num_conv_layers"], ) @@ -336,7 +336,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=1, + default=None, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["StackedCNN"]["num_conv_layers"], ) @@ -1063,7 +1063,7 @@ def module_name(): ) num_conv_layers: int = schema_utils.PositiveInteger( - default=1, + default=None, description="Number of parallel convolutional layers to use.", parameter_metadata=ENCODER_METADATA["StackedCNNRNN"]["num_conv_layers"], ) From 4c7f2e11a37cc1c5a54fb8bd5d94af26d4777fdc Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 08:57:28 -0700 Subject: [PATCH 11/22] Fix some missing param metadata --- ludwig/schema/decoders/base.py | 1 + ludwig/schema/encoders/base.py | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 9c16ceee4f5..cc2bcf6f751 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -75,6 +75,7 @@ def module_name(cls): input_size: int = schema_utils.PositiveInteger( default=1, description="Size of the input to the decoder.", + parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["input_size"], ) diff --git a/ludwig/schema/encoders/base.py b/ludwig/schema/encoders/base.py index 92595db65bc..1ca3dac0d58 100644 --- a/ludwig/schema/encoders/base.py +++ b/ludwig/schema/encoders/base.py @@ -54,36 +54,43 @@ def module_name(): min=0, max=1, description="Dropout rate.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["dropout"], ) activation: str = schema_utils.StringOptions( ["elu", "leakyRelu", "logSigmoid", "relu", "sigmoid", "tanh", "softmax"], default="relu", description="Activation function to apply to the output.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["activation"], ) input_size: int = schema_utils.PositiveInteger( default=None, description="Size of the input to the dense encoder.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["input_size"], ) output_size: int = schema_utils.PositiveInteger( default=256, description="Size of the output of the feature.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["output_size"], ) use_bias: bool = schema_utils.Boolean( 
default=True, description="Whether the layer uses a bias vector.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["use_bias"], ) bias_initializer: Union[str, dict] = schema_utils.InitializerOptions( default="zeros", description="Initializer for the bias vector.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["bias_initializer"], ) weights_initializer: Union[str, dict] = schema_utils.InitializerOptions( description="Initializer for the weight matrix.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["weights_initializer"], ) norm: str = schema_utils.StringOptions( @@ -91,19 +98,23 @@ def module_name(): allow_none=True, default=None, description="Normalization to use in the dense layer.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["norm"], ) norm_params: dict = schema_utils.Dict( default=None, description="Parameters for normalization if norm is either batch or layer.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["norm_params"], ) num_layers: int = schema_utils.PositiveInteger( default=1, description="Number of stacked fully connected layers that the input to the feature passes through.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["num_layers"], ) fc_layers: List[dict] = schema_utils.DictList( default=None, description="List of fully connected layers to use in the encoder.", + parameter_metadata=ENCODER_METADATA["DenseEncoder"]["fc_layers"], ) From f0b294ba75ad92675f7dd5f233d13c357f804645 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 09:02:41 -0700 Subject: [PATCH 12/22] fix --- ludwig/schema/metadata/configs/encoders.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ludwig/schema/metadata/configs/encoders.yaml b/ludwig/schema/metadata/configs/encoders.yaml index a3072c18d46..0a6b587cb2b 100644 --- a/ludwig/schema/metadata/configs/encoders.yaml +++ b/ludwig/schema/metadata/configs/encoders.yaml @@ -2125,8 +2125,9 @@ DenseEncoder: related_parameters: - "No" ui_display_name: Not Displayed - layers: + fc_layers: ui_display_name: null + expected_impact: 1 norm: default_value_reasoning: While batch normalization and layer normalization @@ -2234,6 +2235,7 @@ DenseEncoder: ui_display_name: Output Size use_bias: ui_display_name: null + expected_impact: 1 weights_initializer: default_value_reasoning: Taken from [this paper](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf). 
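Combined with the utils.py change in PATCH 08, additions like these mean a schema field now carries its parameter metadata inside the marshmallow field's metadata dict, next to the description. A rough way to confirm that for the dense encoder — the class name is assumed from the module being edited, so treat this as a sketch rather than an official API:

    # Sketch: report which DenseEncoderConfig fields expose parameter_metadata
    # through their marshmallow field metadata. Class name assumed from context.
    import dataclasses
    from ludwig.schema.encoders.base import DenseEncoderConfig

    for f in dataclasses.fields(DenseEncoderConfig):
        mm = f.metadata.get("marshmallow_field")
        if mm is not None:
            print(f.name, "parameter_metadata" in mm.metadata)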
description_implications: From 156a2f21873880bfb44a0d5ea56c221e518a9017 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 09:38:56 -0700 Subject: [PATCH 13/22] Combiner metadata fix --- ludwig/schema/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ludwig/schema/utils.py b/ludwig/schema/utils.py index 17238e70ea5..cf4fcebe5f2 100644 --- a/ludwig/schema/utils.py +++ b/ludwig/schema/utils.py @@ -801,6 +801,7 @@ def _deserialize(self, value, attr, data, **kwargs): def _jsonschema_type_mapping(self): initializers = list(initializer_registry.keys()) + param_metadata = convert_metadata_to_json(parameter_metadata) if parameter_metadata else None return { "oneOf": [ # Note: default not provided in the custom dict option: @@ -813,6 +814,7 @@ def _jsonschema_type_mapping(self): "title": f"{self.name}_custom_option", "additionalProperties": True, "description": "Customize an existing initializer.", + "parameter_metadata": param_metadata, }, { "type": "string", @@ -820,6 +822,7 @@ def _jsonschema_type_mapping(self): "default": default, "title": f"{self.name}_preconfigured_option", "description": "Pick a preconfigured initializer.", + "parameter_metadata": param_metadata, }, ], "title": self.name, @@ -830,7 +833,13 @@ def _jsonschema_type_mapping(self): return field( metadata={ "marshmallow_field": InitializerOptionsOrCustomDictField( - allow_none=False, load_default=default, dump_default=default, metadata={"description": description} + allow_none=False, + load_default=default, + dump_default=default, + metadata={ + "description": description, + "parameter_metadata": convert_metadata_to_json(parameter_metadata) if parameter_metadata else None, + }, ) }, default=default, From d507aee1263139f4d3ba93fd3cab892681fd41a7 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 09:56:36 -0700 Subject: [PATCH 14/22] fix decoders --- ludwig/schema/decoders/base.py | 50 +++++++++++++++----- ludwig/schema/metadata/configs/decoders.yaml | 29 ++++++++++++ 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index cc2bcf6f751..cd6631a0b39 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -15,44 +15,71 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): """Base class for decoders.""" - type: str + type: str = schema_utils.StringOptions( + [], + description="The type of decoder to use.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] + ) fc_layers: List[Dict[str, Any]] = schema_utils.DictList( - default=None, description="List of dictionaries containing the parameters for each fully connected layer." + default=None, + description="List of dictionaries containing the parameters for each fully connected layer.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_layers"] ) num_fc_layers: int = schema_utils.NonNegativeInteger( - default=0, description="Number of fully-connected layers if fc_layers not specified." 
+ default=0, + description="Number of fully-connected layers if fc_layers not specified.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["num_fc_layers"] ) - fc_output_size: int = schema_utils.PositiveInteger(default=256, description="Output size of fully connected stack.") + fc_output_size: int = schema_utils.PositiveInteger( + default=256, + description="Output size of fully connected stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_output_size"] + ) fc_use_bias: bool = schema_utils.Boolean( - default=True, description="Whether the layer uses a bias vector in the fc_stack." + default=True, + description="Whether the layer uses a bias vector in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"] ) fc_weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( - default="xavier_uniform", description="The weights initializer to use for the layers in the fc_stack" + default="xavier_uniform", + description="The weights initializer to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] ) fc_bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( - default="zeros", description="The bias initializer to use for the layers in the fc_stack" + default="zeros", + description="The bias initializer to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] ) fc_norm: str = schema_utils.StringOptions( - ["batch", "layer"], description="The normalization to use for the layers in the fc_stack" + ["batch", "layer"], + description="The normalization to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm"] ) fc_norm_params: dict = schema_utils.Dict( - description="The additional parameters for the normalization in the fc_stack" + description="The additional parameters for the normalization in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm_params"] ) fc_activation: str = schema_utils.ActivationOptions( - default="relu", description="The activation to use for the layers in the fc_stack" + default="relu", + description="The activation to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_activation"] ) fc_dropout: float = schema_utils.FloatRange( - default=0.0, min=0, max=1, description="The dropout rate to use for the layers in the fc_stack" + default=0.0, + min=0, + max=1, + description="The dropout rate to use for the layers in the fc_stack", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_dropout"] ) @@ -70,6 +97,7 @@ def module_name(cls): "passthrough", description="The passthrough decoder simply returns the raw numerical values coming from the combiner as " "outputs", + parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["type"] ) input_size: int = schema_utils.PositiveInteger( diff --git a/ludwig/schema/metadata/configs/decoders.yaml b/ludwig/schema/metadata/configs/decoders.yaml index f2a70b9a68b..14de6f92606 100644 --- a/ludwig/schema/metadata/configs/decoders.yaml +++ b/ludwig/schema/metadata/configs/decoders.yaml @@ -1,3 +1,26 @@ +BaseDecoder: + type: + expected_impact: 1 + fc_layers: + expected_impact: 1 + num_fc_layers: + expected_impact: 3 + fc_output_size: + expected_impact: 3 + fc_use_bias: + expected_impact: 1 + fc_weights_initializer: + expected_impact: 1 + fc_bias_initializer: + expected_impact: 1 + fc_norm: + expected_impact: 2 + fc_norm_params: + 
expected_impact: 1 + fc_activation: + expected_impact: 2 + fc_dropout: + expected_impact: 3 Classifier: type: short_description: @@ -5,6 +28,7 @@ Classifier: long_description: The classifier decoder is a (potentially empty) stack of fully connected layers, followed by a projection into a vector of size of the number of available classes, followed by a sigmoid. + expected_impact: 0 bias_initializer: default_value_reasoning: It is possible and common to initialize the biases @@ -102,6 +126,7 @@ Projector: The Projector decoder is a (potentially empty) stack of fully connected layers, followed by a projection into a tensor of the vector size (optionally followed by a softmax in the case of multi-class classification). + expected_impact: 0 activation: description_implications: Changing the activation functions has an impact @@ -230,6 +255,7 @@ Regressor: long_description: The regressor decoder is a (potentially empty) stack of fully connected layers, followed by a projection to a single number. + expected_impact: 0 activation: ui_display_name: null expected_impact: 2 @@ -324,6 +350,7 @@ PassthroughDecoder: Provides the raw input from the combiner. long_description: The passthrough decoder simply returns the raw output coming from the combiner. + expected_impact: 0 input_size: other_information: Internal Only internal_only: true @@ -349,6 +376,7 @@ SequenceGeneratorDecoder: feature without reduced outputs or the output of a sequence-based combiner. If a `b x h` input is provided to a generator decoder using an RNN with attention instead, an error will be raised during model building. + expected_impact: 0 cell_type: ui_display_name: null expected_impact: 3 @@ -405,6 +433,7 @@ SequenceTaggerDecoder: a hidden dimension, which is the output of a sequence, text or time series input feature without reduced outputs or the output of a sequence-based combiner. If a `b x h` input is provided instead, an error will be raised during model building. + expected_impact: 0 attention_embedding_size: default_value_reasoning: Not too big, not too small. 
description_implications: From 4f2d6e1bd3dc653f2dfa7fdc214804113797c7c0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 16:57:33 +0000 Subject: [PATCH 15/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/decoders/base.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index cd6631a0b39..10228ecfd58 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -16,62 +16,60 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): """Base class for decoders.""" type: str = schema_utils.StringOptions( - [], - description="The type of decoder to use.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] + [], description="The type of decoder to use.", parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] ) fc_layers: List[Dict[str, Any]] = schema_utils.DictList( default=None, description="List of dictionaries containing the parameters for each fully connected layer.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_layers"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_layers"], ) num_fc_layers: int = schema_utils.NonNegativeInteger( default=0, description="Number of fully-connected layers if fc_layers not specified.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["num_fc_layers"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["num_fc_layers"], ) fc_output_size: int = schema_utils.PositiveInteger( default=256, description="Output size of fully connected stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_output_size"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_output_size"], ) fc_use_bias: bool = schema_utils.Boolean( default=True, description="Whether the layer uses a bias vector in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"], ) fc_weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( default="xavier_uniform", description="The weights initializer to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ) fc_bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( default="zeros", description="The bias initializer to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ) fc_norm: str = schema_utils.StringOptions( ["batch", "layer"], description="The normalization to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm"], ) fc_norm_params: dict = schema_utils.Dict( description="The additional parameters for the normalization in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm_params"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_norm_params"], ) fc_activation: str = schema_utils.ActivationOptions( default="relu", description="The activation to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_activation"] + 
parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_activation"], ) fc_dropout: float = schema_utils.FloatRange( @@ -79,7 +77,7 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): min=0, max=1, description="The dropout rate to use for the layers in the fc_stack", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_dropout"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_dropout"], ) @@ -97,7 +95,7 @@ def module_name(cls): "passthrough", description="The passthrough decoder simply returns the raw numerical values coming from the combiner as " "outputs", - parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["type"] + parameter_metadata=DECODER_METADATA["PassthroughDecoder"]["type"], ) input_size: int = schema_utils.PositiveInteger( From 1c26d29a7d820a03ddff011a4e7ffd5164de311f Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 10:01:45 -0700 Subject: [PATCH 16/22] fix --- ludwig/schema/decoders/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index cd6631a0b39..10342646cdd 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -16,7 +16,7 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): """Base class for decoders.""" type: str = schema_utils.StringOptions( - [], + ["regressor", "classifier", "projector", "generator", "tagger"], description="The type of decoder to use.", parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] ) From 71bef9447ab950cf9ff31169cb5eda179700482c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 17:04:29 +0000 Subject: [PATCH 17/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/decoders/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 5eaef9d0139..46ccbfc0fbf 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -18,7 +18,7 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): type: str = schema_utils.StringOptions( ["regressor", "classifier", "projector", "generator", "tagger"], description="The type of decoder to use.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["type"], ) fc_layers: List[Dict[str, Any]] = schema_utils.DictList( From 3d2c68ed94ca2b5a4950127a3f7924f36b37d8e4 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 10:57:51 -0700 Subject: [PATCH 18/22] Initializer fix --- ludwig/schema/decoders/base.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 5eaef9d0139..34381159104 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -45,15 +45,35 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_use_bias"], ) - fc_weights_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( + fc_weights_initializer: Union[str, Dict] = schema_utils.OneOfOptionsField( default="xavier_uniform", description="The weights initializer to use for the layers in the fc_stack", + field_options=[ + schema_utils.InitializerOptions( + description="Preconfigured initializer to use for the layers in 
the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + ), + schema_utils.Dict( + description="Custom initializer to use for the layers in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + ), + ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ) - fc_bias_initializer: Union[str, Dict] = schema_utils.InitializerOrDict( + fc_bias_initializer: Union[str, Dict] = schema_utils.OneOfOptionsField( default="zeros", description="The bias initializer to use for the layers in the fc_stack", + field_options=[ + schema_utils.InitializerOptions( + description="Preconfigured bias initializer to use for the layers in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + ), + schema_utils.Dict( + description="Custom bias initializer to use for the layers in the fc_stack.", + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + ), + ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ) From 3e2d32be03fc9cb53807c983726cf6707b1af717 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 17:59:06 +0000 Subject: [PATCH 19/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/decoders/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ludwig/schema/decoders/base.py b/ludwig/schema/decoders/base.py index 851617281ba..5962c6d0fa6 100644 --- a/ludwig/schema/decoders/base.py +++ b/ludwig/schema/decoders/base.py @@ -51,11 +51,11 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): field_options=[ schema_utils.InitializerOptions( description="Preconfigured initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ), schema_utils.Dict( description="Custom initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], ), ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_weights_initializer"], @@ -67,11 +67,11 @@ class BaseDecoderConfig(schema_utils.BaseMarshmallowConfig, ABC): field_options=[ schema_utils.InitializerOptions( description="Preconfigured bias initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ), schema_utils.Dict( description="Custom bias initializer to use for the layers in the fc_stack.", - parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"] + parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], ), ], parameter_metadata=DECODER_METADATA["BaseDecoder"]["fc_bias_initializer"], From 9e80f4bea744027f563a7b898d1d551af271a760 Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 11:21:47 -0700 Subject: [PATCH 20/22] Loss param metadata --- ludwig/schema/features/loss/loss.py | 64 +++++++++++++++++++++--- ludwig/schema/metadata/__init__.py | 1 + ludwig/schema/metadata/configs/loss.yaml | 54 ++++++++++++++++++++ 3 files changed, 111 insertions(+), 8 deletions(-) create mode 100644 
ludwig/schema/metadata/configs/loss.yaml diff --git a/ludwig/schema/features/loss/loss.py b/ludwig/schema/features/loss/loss.py index 1244857fe6b..519f5eb38cf 100644 --- a/ludwig/schema/features/loss/loss.py +++ b/ludwig/schema/features/loss/loss.py @@ -14,6 +14,7 @@ SOFTMAX_CROSS_ENTROPY, ) from ludwig.schema import utils as schema_utils +from ludwig.schema.metadata import LOSS_METADATA @DeveloperAPI @@ -37,6 +38,7 @@ class MSELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["MSELoss"]["weight"], ) @@ -51,6 +53,7 @@ class MAELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["MAELoss"]["weight"], ) @@ -65,6 +68,7 @@ class RMSELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["RMSELoss"]["weight"], ) @@ -79,6 +83,7 @@ class RMSPELossConfig(BaseLossConfig): weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["RMSPELoss"]["weight"], ) @@ -93,15 +98,25 @@ class BWCEWLossConfig(BaseLossConfig): positive_class_weight: int = schema_utils.NonNegativeInteger( default=None, description="Weight of the positive class.", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["positive_class_weight"], ) - robust_lambda: int = schema_utils.NonNegativeInteger(default=0, description="") + robust_lambda: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["robust_lambda"], + ) - confidence_penalty: float = schema_utils.NonNegativeFloat(default=0, description="") + confidence_penalty: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["confidence_penalty"], + ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["BWCEWLoss"]["weight"], ) @@ -117,24 +132,39 @@ class SoftmaxCrossEntropyLossConfig(BaseLossConfig): list_type=float, default=None, description="Weights to apply to each class in the loss. If not specified, all classes are weighted equally.", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["class_weights"], ) - robust_lambda: int = schema_utils.NonNegativeInteger(default=0, description="") + robust_lambda: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["robust_lambda"], + ) - confidence_penalty: float = schema_utils.NonNegativeFloat(default=0, description="") + confidence_penalty: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["confidence_penalty"], + ) class_similarities: list = schema_utils.List( list, default=None, description="If not null this parameter is a c x c matrix in the form of a list of lists that contains the " "mutual similarity of classes. It is used if `class_similarities_temperature` is greater than 0. 
", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["class_similarities"], ) - class_similarities_temperature: int = schema_utils.NonNegativeInteger(default=0, description="") + class_similarities_temperature: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["class_similarities_temperature"], + ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["SoftmaxCrossEntropyLoss"]["weight"], ) @@ -150,29 +180,45 @@ class SequenceSoftmaxCrossEntropyLossConfig(BaseLossConfig): list_type=float, default=None, description="Weights to apply to each class in the loss. If not specified, all classes are weighted equally.", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["class_weights"], ) - robust_lambda: int = schema_utils.NonNegativeInteger(default=0, description="") + robust_lambda: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["robust_lambda"], + ) - confidence_penalty: float = schema_utils.NonNegativeFloat(default=0, description="") + confidence_penalty: float = schema_utils.NonNegativeFloat( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["confidence_penalty"], + ) class_similarities: list = schema_utils.List( list, default=None, description="If not null this parameter is a c x c matrix in the form of a list of lists that contains the " "mutual similarity of classes. It is used if `class_similarities_temperature` is greater than 0. ", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["class_similarities"], ) - class_similarities_temperature: int = schema_utils.NonNegativeInteger(default=0, description="") + class_similarities_temperature: int = schema_utils.NonNegativeInteger( + default=0, + description="", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["class_similarities_temperature"], + ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["weight"], ) unique: bool = schema_utils.Boolean( default=False, description="If true, the loss is only computed for unique elements in the sequence.", + parameter_metadata=LOSS_METADATA["SequenceSoftmaxCrossEntropyLoss"]["unique"], ) @@ -188,9 +234,11 @@ class SigmoidCrossEntropyLossConfig(BaseLossConfig): list_type=float, default=None, description="Weights to apply to each class in the loss. 
If not specified, all classes are weighted equally.", + parameter_metadata=LOSS_METADATA["SigmoidCrossEntropyLoss"]["class_weights"], ) weight: float = schema_utils.NonNegativeFloat( default=1.0, description="Weight of the loss.", + parameter_metadata=LOSS_METADATA["SigmoidCrossEntropyLoss"]["weight"], ) diff --git a/ludwig/schema/metadata/__init__.py b/ludwig/schema/metadata/__init__.py index fbeb96ed964..367a2ba5097 100644 --- a/ludwig/schema/metadata/__init__.py +++ b/ludwig/schema/metadata/__init__.py @@ -34,3 +34,4 @@ def _load(fname: str) -> Dict[str, Any]: PREPROCESSING_METADATA = _load("preprocessing.yaml") TRAINER_METADATA = _load("trainer.yaml") OPTIMIZER_METADATA = _load("optimizers.yaml") +LOSS_METADATA = _load("loss.yaml") diff --git a/ludwig/schema/metadata/configs/loss.yaml b/ludwig/schema/metadata/configs/loss.yaml new file mode 100644 index 00000000000..128cd055843 --- /dev/null +++ b/ludwig/schema/metadata/configs/loss.yaml @@ -0,0 +1,54 @@ +MSELoss: + weight: + expected_impact: 3 +MAELoss: + weight: + expected_impact: 3 +RMSELoss: + weight: + expected_impact: 3 +RMSPELoss: + weight: + expected_impact: 3 +BWCEWLoss: + positive_class_weight: + expected_impact: 3 + robust_lambda: + expected_impact: 2 + confidence_penalty: + expected_impact: 2 + weight: + expected_impact: 3 +SoftmaxCrossEntropyLoss: + class_weights: + expected_impact: 3 + robust_lambda: + expected_impact: 2 + confidence_penalty: + expected_impact: 2 + class_similarities: + expected_impact: 2 + class_similarities_temperature: + expected_impact: 2 + weight: + expected_impact: 3 +SequenceSoftmaxCrossEntropyLoss: + class_weights: + expected_impact: 3 + robust_lambda: + expected_impact: 2 + confidence_penalty: + expected_impact: 2 + class_similarities: + expected_impact: 2 + class_similarities_temperature: + expected_impact: 2 + weight: + expected_impact: 3 + unique: + expected_impact: 2 +SigmoidCrossEntropyLoss: + class_weights: + expected_impact: 3 + weight: + expected_impact: 3 From 90bd37d590069c32122f8b4e8401b703fe4af6be Mon Sep 17 00:00:00 2001 From: connor-mccorm Date: Thu, 19 Jan 2023 12:35:04 -0700 Subject: [PATCH 21/22] Output Feature params --- ludwig/schema/features/binary_feature.py | 6 ++ ludwig/schema/features/category_feature.py | 6 ++ ludwig/schema/features/number_feature.py | 5 ++ ludwig/schema/features/sequence_feature.py | 4 ++ ludwig/schema/features/set_feature.py | 5 ++ ludwig/schema/features/text_feature.py | 5 ++ ludwig/schema/features/vector_feature.py | 6 ++ ludwig/schema/metadata/configs/features.yaml | 60 ++++++++++++++++++++ 8 files changed, 97 insertions(+) diff --git a/ludwig/schema/features/binary_feature.py b/ludwig/schema/features/binary_feature.py index c3e922ec361..423d85588b3 100644 --- a/ludwig/schema/features/binary_feature.py +++ b/ludwig/schema/features/binary_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -73,6 +74,7 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf calibration: bool = schema_utils.Boolean( default=False, description="Calibrate the model's output probabilities using temperature scaling.", + parameter_metadata=FEATURE_METADATA[BINARY]["calibration"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -85,6 +87,7 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf dependencies: list = 
schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[BINARY]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="binary_output") @@ -92,12 +95,14 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[BINARY]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[BINARY]["reduce_input"], ) threshold: float = schema_utils.FloatRange( @@ -106,4 +111,5 @@ class BinaryOutputFeatureConfig(BaseOutputFeatureConfig, BinaryOutputFeatureConf max=1, description="The threshold used to convert output probabilities to predictions. Predicted probabilities greater" "than or equal to threshold are mapped to True.", + parameter_metadata=FEATURE_METADATA[BINARY]["threshold"], ) diff --git a/ludwig/schema/features/category_feature.py b/ludwig/schema/features/category_feature.py index a3ab732d184..4fbb23ea308 100644 --- a/ludwig/schema/features/category_feature.py +++ b/ludwig/schema/features/category_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -75,6 +76,7 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature calibration: bool = schema_utils.Boolean( default=False, description="Calibrate the model's output probabilities using temperature scaling.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["calibration"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -87,6 +89,7 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="category_output") @@ -94,12 +97,14 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[CATEGORY]["reduce_input"], ) top_k: int = schema_utils.NonNegativeInteger( @@ -107,4 +112,5 @@ class CategoryOutputFeatureConfig(BaseOutputFeatureConfig, CategoryOutputFeature description="Determines the parameter k, the number of categories to consider when computing the top_k " "measure. 
It computes accuracy but considering as a match if the true category appears in the " "first k predicted categories ranked by decoder's confidence.", + parameter_metadata=FEATURE_METADATA[CATEGORY]["top_k"], ) diff --git a/ludwig/schema/features/number_feature.py b/ludwig/schema/features/number_feature.py index 604583b85dc..2bd51ab4689 100644 --- a/ludwig/schema/features/number_feature.py +++ b/ludwig/schema/features/number_feature.py @@ -21,6 +21,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -80,6 +81,7 @@ class NumberOutputFeatureConfig(BaseOutputFeatureConfig, NumberOutputFeatureConf min=0, max=999999999, description="Clip the predicted output to the specified range.", + parameter_metadata=FEATURE_METADATA[NUMBER]["clip"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -92,17 +94,20 @@ class NumberOutputFeatureConfig(BaseOutputFeatureConfig, NumberOutputFeatureConf dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[NUMBER]["dependencies"], ) reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[NUMBER]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[NUMBER]["reduce_input"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="number_output") diff --git a/ludwig/schema/features/sequence_feature.py b/ludwig/schema/features/sequence_feature.py index 787d34d5293..7c5b00b31a6 100644 --- a/ludwig/schema/features/sequence_feature.py +++ b/ludwig/schema/features/sequence_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -82,6 +83,7 @@ class SequenceOutputFeatureConfig(BaseOutputFeatureConfig, SequenceOutputFeature dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[SEQUENCE]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="sequence_output") @@ -89,10 +91,12 @@ class SequenceOutputFeatureConfig(BaseOutputFeatureConfig, SequenceOutputFeature reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[SEQUENCE]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[SEQUENCE]["reduce_input"], ) diff --git a/ludwig/schema/features/set_feature.py b/ludwig/schema/features/set_feature.py index 5c2ea6faf7f..da0faf8e27f 100644 --- a/ludwig/schema/features/set_feature.py +++ b/ludwig/schema/features/set_feature.py @@ -19,6 +19,7 @@ 
output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -80,6 +81,7 @@ class SetOutputFeatureConfig(BaseOutputFeatureConfig, SetOutputFeatureConfigMixi dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[SET]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="set_output") @@ -87,12 +89,14 @@ class SetOutputFeatureConfig(BaseOutputFeatureConfig, SetOutputFeatureConfigMixi reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[SET]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[SET]["reduce_input"], ) threshold: float = schema_utils.FloatRange( @@ -101,4 +105,5 @@ class SetOutputFeatureConfig(BaseOutputFeatureConfig, SetOutputFeatureConfigMixi max=1, description="The threshold used to convert output probabilities to predictions. Tokens with predicted" "probabilities greater than or equal to threshold are predicted to be in the output set (True).", + parameter_metadata=FEATURE_METADATA[SET]["threshold"], ) diff --git a/ludwig/schema/features/text_feature.py b/ludwig/schema/features/text_feature.py index 62d1ae39858..a1428b37cb8 100644 --- a/ludwig/schema/features/text_feature.py +++ b/ludwig/schema/features/text_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -75,6 +76,7 @@ class TextOutputFeatureConfig(BaseOutputFeatureConfig, TextOutputFeatureConfigMi default=None, description="If not null this parameter is a c x c matrix in the form of a list of lists that contains the " "mutual similarity of classes. It is used if `class_similarities_temperature` is greater than 0. 
", + parameter_metadata=FEATURE_METADATA[TEXT]["class_similarities"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -87,6 +89,7 @@ class TextOutputFeatureConfig(BaseOutputFeatureConfig, TextOutputFeatureConfigMi dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[TEXT]["dependencies"], ) preprocessing: BasePreprocessingConfig = PreprocessingDataclassField(feature_type="text_output") @@ -94,10 +97,12 @@ class TextOutputFeatureConfig(BaseOutputFeatureConfig, TextOutputFeatureConfigMi reduce_dependencies: str = schema_utils.ReductionOptions( default="sum", description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[TEXT]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default="sum", description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[TEXT]["reduce_input"], ) diff --git a/ludwig/schema/features/vector_feature.py b/ludwig/schema/features/vector_feature.py index 160d8035f9d..def4c5ff90a 100644 --- a/ludwig/schema/features/vector_feature.py +++ b/ludwig/schema/features/vector_feature.py @@ -19,6 +19,7 @@ output_mixin_registry, ) from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY +from ludwig.schema.metadata import FEATURE_METADATA from ludwig.schema.utils import BaseMarshmallowConfig @@ -73,6 +74,7 @@ class VectorOutputFeatureConfig(BaseOutputFeatureConfig, VectorOutputFeatureConf dependencies: list = schema_utils.List( default=[], description="List of input features that this feature depends on.", + parameter_metadata=FEATURE_METADATA[VECTOR]["dependencies"], ) default_validation_metric: str = schema_utils.StringOptions( @@ -87,22 +89,26 @@ class VectorOutputFeatureConfig(BaseOutputFeatureConfig, VectorOutputFeatureConf reduce_dependencies: str = schema_utils.ReductionOptions( default=None, description="How to reduce the dependencies of the output feature.", + parameter_metadata=FEATURE_METADATA[VECTOR]["reduce_dependencies"], ) reduce_input: str = schema_utils.ReductionOptions( default=None, description="How to reduce an input that is not a vector, but a matrix or a higher order tensor, on the first " "dimension (second if you count the batch dimension)", + parameter_metadata=FEATURE_METADATA[VECTOR]["reduce_input"], ) softmax: bool = schema_utils.Boolean( default=False, description="Determines whether to apply a softmax at the end of the decoder. This is useful for predicting a " "vector of values that sum up to 1 and can be interpreted as probabilities.", + parameter_metadata=FEATURE_METADATA[VECTOR]["softmax"], ) vector_size: int = schema_utils.PositiveInteger( default=None, allow_none=True, description="The size of the vector. 
If None, the vector size will be inferred from the data.", + parameter_metadata=FEATURE_METADATA[VECTOR]["vector_size"], ) diff --git a/ludwig/schema/metadata/configs/features.yaml b/ludwig/schema/metadata/configs/features.yaml index 9674fb97bb2..8fb13109e22 100644 --- a/ludwig/schema/metadata/configs/features.yaml +++ b/ludwig/schema/metadata/configs/features.yaml @@ -201,6 +201,16 @@ binary: - fill_value ui_display_name: Missing Value Strategy expected_impact: 3 + calibration: + expected_impact: 2 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + threshold: + expected_impact: 3 category: preprocessing: computed_fill_value: @@ -252,6 +262,16 @@ category: will leave out only very rare tokens that should not influence performance substantially ui_display_name: Most common (vocabulary size) + calibration: + expected_impact: 2 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + top_k: + expected_impact: 3 date: preprocessing: computed_fill_value: @@ -434,6 +454,14 @@ number: \ std = 1. It\u2019s useful when there are a few outliers, but not\ \ so extreme that you need clipping." ui_display_name: Normalization + clip: + expected_impact: 2 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 sequence: preprocessing: computed_fill_value: @@ -533,6 +561,12 @@ sequence: rather than treated as an unknown. expected_impact: 0 ui_display_name: Vocab File + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 set: preprocessing: computed_fill_value: @@ -586,6 +620,14 @@ set: tokenizer: ui_display_name: null expected_impact: 3 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + threshold: + expected_impact: 3 text: preprocessing: computed_fill_value: @@ -753,6 +795,14 @@ text: rather than treated as an unknown. 
expected_impact: 0 ui_display_name: Vocab File + class_similarities: + expected_impact: 1 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 timeseries: preprocessing: computed_fill_value: @@ -814,3 +864,13 @@ vector: vector_size: ui_display_name: null expected_impact: 3 + dependencies: + expected_impact: 1 + reduce_dependencies: + expected_impact: 1 + reduce_input: + expected_impact: 1 + softmax: + expected_impact: 3 + vector_size: + expected_impact: 3 From 525c4b7961614b4f12b623169f2339780f8b15c8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 19 Jan 2023 19:36:42 +0000 Subject: [PATCH 22/22] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ludwig/schema/features/binary_feature.py | 2 +- ludwig/schema/features/category_feature.py | 2 +- ludwig/schema/features/number_feature.py | 2 +- ludwig/schema/features/sequence_feature.py | 2 +- ludwig/schema/features/set_feature.py | 2 +- ludwig/schema/features/text_feature.py | 2 +- ludwig/schema/features/vector_feature.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ludwig/schema/features/binary_feature.py b/ludwig/schema/features/binary_feature.py index 423d85588b3..4c852de7f6d 100644 --- a/ludwig/schema/features/binary_feature.py +++ b/ludwig/schema/features/binary_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/category_feature.py b/ludwig/schema/features/category_feature.py index 4fbb23ea308..c23c861d674 100644 --- a/ludwig/schema/features/category_feature.py +++ b/ludwig/schema/features/category_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/number_feature.py b/ludwig/schema/features/number_feature.py index 2bd51ab4689..e54e32403f3 100644 --- a/ludwig/schema/features/number_feature.py +++ b/ludwig/schema/features/number_feature.py @@ -20,8 +20,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/sequence_feature.py b/ludwig/schema/features/sequence_feature.py index 7c5b00b31a6..d8ea9d2718d 100644 --- a/ludwig/schema/features/sequence_feature.py +++ b/ludwig/schema/features/sequence_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/set_feature.py b/ludwig/schema/features/set_feature.py index da0faf8e27f..ff38a562637 100644 --- 
a/ludwig/schema/features/set_feature.py +++ b/ludwig/schema/features/set_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/text_feature.py b/ludwig/schema/features/text_feature.py index a1428b37cb8..b386d13fc41 100644 --- a/ludwig/schema/features/text_feature.py +++ b/ludwig/schema/features/text_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig diff --git a/ludwig/schema/features/vector_feature.py b/ludwig/schema/features/vector_feature.py index def4c5ff90a..629fda078cb 100644 --- a/ludwig/schema/features/vector_feature.py +++ b/ludwig/schema/features/vector_feature.py @@ -18,8 +18,8 @@ output_config_registry, output_mixin_registry, ) -from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.metadata import FEATURE_METADATA +from ludwig.schema.metadata.parameter_metadata import INTERNAL_ONLY from ludwig.schema.utils import BaseMarshmallowConfig
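
For orientation, the pattern the last few patches converge on is a single nested-dict lookup: each config field fetches its parameter metadata from a YAML file keyed first by class name and then by parameter name (for example LOSS_METADATA["MSELoss"]["weight"] above). The snippet below is a minimal, standalone sketch of that lookup using plain PyYAML and a two-class excerpt of the new loss.yaml; it is an illustration only, not Ludwig's actual loader in ludwig/schema/metadata/__init__.py, which may wrap each entry in a richer metadata object.

import yaml

# Two-class excerpt of the loss.yaml added in patch 20, inlined so the
# sketch is self-contained.
LOSS_YAML = """
MSELoss:
  weight:
    expected_impact: 3
BWCEWLoss:
  positive_class_weight:
    expected_impact: 3
  robust_lambda:
    expected_impact: 2
"""

# Mimics LOSS_METADATA = _load("loss.yaml"): parse the YAML into nested dicts.
LOSS_METADATA = yaml.safe_load(LOSS_YAML)

# A schema field then attaches the entry for its own class and parameter,
# mirroring parameter_metadata=LOSS_METADATA["MSELoss"]["weight"] in the diffs.
print(LOSS_METADATA["MSELoss"]["weight"]["expected_impact"])           # -> 3
print(LOSS_METADATA["BWCEWLoss"]["robust_lambda"]["expected_impact"])  # -> 2

Keeping expected_impact and related hints in YAML rather than inline in the dataclasses keeps the field declarations short and lets the metadata be tuned without touching the schema code, the same trade-off the existing trainer and optimizer metadata files already make.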