@@ -54,9 +54,6 @@ class GemmaConfig(PretrainedConfig):
             The attention head dimension.
         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
             The legacy activation function. It is overwritten by the `hidden_activation`.
-        hidden_activation (`str` or `function`, *optional*):
-            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
-            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
         max_position_embeddings (`int`, *optional*, defaults to 8192):
             The maximum sequence length that this model might ever be used with.
         initializer_range (`float`, *optional*, defaults to 0.02):
@@ -117,7 +114,6 @@ def __init__(
         num_key_value_heads=16,
         head_dim=256,
         hidden_act="gelu_pytorch_tanh",
-        hidden_activation=None,
         max_position_embeddings=8192,
         initializer_range=0.02,
         rms_norm_eps=1e-6,
@@ -140,7 +136,6 @@ def __init__(
         self.head_dim = head_dim
         self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
-        self.hidden_activation = hidden_activation
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
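
For context, a minimal usage sketch, assuming this diff lands in `transformers` and `GemmaConfig` keeps the remaining constructor arguments shown above; after the removal, `hidden_act` is the only activation setting on the config:

```python
from transformers import GemmaConfig

# Build a config using only arguments kept by this diff;
# `hidden_activation` is no longer a constructor argument, `hidden_act` remains.
config = GemmaConfig(
    hidden_act="gelu_pytorch_tanh",
    num_key_value_heads=16,
    head_dim=256,
    max_position_embeddings=8192,
    rms_norm_eps=1e-6,
)
print(config.hidden_act)  # -> "gelu_pytorch_tanh"
```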