Propagate attention_dropout flag for GPT-3 (#5669)
* Propagate attention_dropout flag for GPT-3

Signed-off-by: Mikołaj Błaż <mblaz@nvidia.com>

* Add default to megatron_gpt_config

Signed-off-by: Mikołaj Błaż <mblaz@nvidia.com>

Signed-off-by: Mikołaj Błaż <mblaz@nvidia.com>
Co-authored-by: Oleksii Kuchaiev <okuchaiev@users.noreply.github.com>
Co-authored-by: Eric Harper <complex451@gmail.com>
Authored by 3 people on Dec 20, 2022 · parent 1d816db · commit 80cd1c1
Showing 4 changed files with 9 additions and 0 deletions.

@@ -60,6 +60,7 @@ model:
 init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.')
 use_scaled_init_method: True # use scaled residuals initialization
 hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
+attention_dropout: 0.1 # Dropout probability for attention
 kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
 apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
 normalization: layernorm # Type of normalization layers
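
The new model.attention_dropout key behaves like any other entry in the config above. A minimal sketch of setting and overriding it, assuming an OmegaConf-based config as used by this YAML file (the override shown here is illustrative, not part of the diff):

from omegaconf import OmegaConf

# Illustrative subset of the GPT model config; only the dropout keys are shown.
cfg = OmegaConf.create({
    "model": {
        "hidden_dropout": 0.1,     # dropout on hidden states (already present)
        "attention_dropout": 0.1,  # new key introduced by this commit
    }
})

# Override at launch time, analogous to passing model.attention_dropout=0.0
# as a Hydra-style command-line override.
cfg.model.attention_dropout = 0.0
print(OmegaConf.to_yaml(cfg))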

@@ -113,6 +113,7 @@ def __init__(
 fp16_lm_cross_entropy=False,
 use_cpu_initialization=False,
 hidden_dropout=0.1,
+attention_dropout=0.1,
 precision=16,
 fp32_residual_connection=False,
 activations_checkpoint_granularity=None,
@@ -165,6 +166,7 @@ def __init__(
 vocab_size=vocab_size,
 hidden_size=hidden_size,
 hidden_dropout=hidden_dropout,
+attention_dropout=attention_dropout,
 num_tokentypes=num_tokentypes,
 max_position_embeddings=max_position_embeddings,
 num_layers=num_layers,
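
The hunks above follow a plain pass-through pattern: the constructor gains attention_dropout with a default of 0.1 and forwards it unchanged to the language-model builder, so existing callers keep the previous behaviour. A minimal sketch with hypothetical stand-in names (not the real NeMo classes):

# Hypothetical stand-ins for the wrapper and builder touched in this hunk;
# the real classes take many more arguments.
def build_language_model(hidden_dropout=0.1, attention_dropout=0.1):
    return {"hidden_dropout": hidden_dropout, "attention_dropout": attention_dropout}

class GPTWrapper:
    def __init__(self, hidden_dropout=0.1, attention_dropout=0.1):
        # Forward the new flag alongside hidden_dropout, as the diff does.
        self.language_model = build_language_model(
            hidden_dropout=hidden_dropout,
            attention_dropout=attention_dropout,
        )

print(GPTWrapper(attention_dropout=0.0).language_model)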

@@ -165,6 +165,7 @@ def model_provider_func(self, pre_process, post_process):
 fp16_lm_cross_entropy=self.cfg.get('fp16_lm_cross_entropy', False),
 use_cpu_initialization=self.cfg.get('use_cpu_initialization', False),
 hidden_dropout=self.cfg.get('hidden_dropout', 0.1),
+attention_dropout=self.cfg.get('attention_dropout', 0.1),
 precision=self.cfg.get('precision', 16),
 fp32_residual_connection=self.cfg.get('fp32_residual_connection', False),
 activations_checkpoint_granularity=self.cfg.get('activations_checkpoint_granularity', None),
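
The cfg.get('attention_dropout', 0.1) call above is what keeps older configs working: a config written before this commit has no attention_dropout key and falls back to 0.1. A minimal sketch of that fallback, assuming an OmegaConf DictConfig like self.cfg (values are illustrative):

from omegaconf import OmegaConf

old_cfg = OmegaConf.create({"hidden_dropout": 0.1})       # pre-commit config, no new key
new_cfg = OmegaConf.create({"attention_dropout": 0.05})   # config that sets the new key

print(old_cfg.get("attention_dropout", 0.1))  # 0.1  -> previous behaviour preserved
print(new_cfg.get("attention_dropout", 0.1))  # 0.05 -> explicit value wins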

@@ -59,6 +59,7 @@ def get_language_model(
 init_method_std=0.02,
 use_cpu_initialization=False,
 hidden_dropout=0.1,
+attention_dropout=0.1,
 precision=16,
 fp32_residual_connection=False,
 activations_checkpoint_method=None,
@@ -122,6 +123,7 @@ def get_language_model(
 post_process=post_process,
 use_cpu_initialization=use_cpu_initialization,
 hidden_dropout=hidden_dropout,
+attention_dropout=attention_dropout,
 precision=precision,
 fp32_residual_connection=fp32_residual_connection,
 activations_checkpoint_method=activations_checkpoint_method,
@@ -410,6 +412,7 @@ def __init__(
 post_process=True,
 use_cpu_initialization=False,
 hidden_dropout=0.1,
+attention_dropout=0.1,
 precision=16,
 fp32_residual_connection=False,
 activations_checkpoint_method=None,
@@ -497,6 +500,7 @@ def __init__(
 normalization=normalization,
 layernorm_epsilon=layernorm_epsilon,
 hidden_dropout=hidden_dropout,
+attention_dropout=attention_dropout,
 use_cpu_initialization=use_cpu_initialization,
 persist_layer_norm=persist_layer_norm,
 openai_gelu=openai_gelu,
@@ -544,6 +548,7 @@ def __init__(
 normalization=normalization,
 layernorm_epsilon=layernorm_epsilon,
 hidden_dropout=hidden_dropout,
+attention_dropout=attention_dropout,
 use_cpu_initialization=use_cpu_initialization,
 bias_activation_fusion=bias_activation_fusion,
 bias_dropout_add_fusion=bias_dropout_add_fusion,
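
For reference, attention_dropout and hidden_dropout act at different points of a transformer layer: attention_dropout regularizes the softmax-normalized attention probabilities, while hidden_dropout is applied to hidden-state outputs. A schematic single-head sketch in PyTorch, not the Megatron implementation:

import torch
import torch.nn.functional as F

def toy_attention(q, k, v, attention_dropout=0.1, training=True):
    # Scaled dot-product scores, then softmax over the key dimension.
    scores = q @ k.transpose(-2, -1) / (k.shape[-1] ** 0.5)
    probs = F.softmax(scores, dim=-1)
    # This probability tensor is what attention_dropout regularizes.
    probs = F.dropout(probs, p=attention_dropout, training=training)
    return probs @ v

q = k = v = torch.randn(2, 4, 8)     # (batch, sequence, head_dim)
print(toy_attention(q, k, v).shape)  # torch.Size([2, 4, 8])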
