
Commit ffe60fd

v4.39 deprecations 🧼 (#29492)
1 parent 979fccc commit ffe60fd

14 files changed (+9, -400 lines)

docs/source/en/internal/generation_utils.md

Lines changed: 0 additions & 6 deletions
@@ -336,12 +336,6 @@ A [`Constraint`] can be used to force the generation to include specific tokens
 - process
 - finalize
 
-## Utilities
-
-[[autodoc]] top_k_top_p_filtering
-
-[[autodoc]] tf_top_k_top_p_filtering
-
 ## Streamers
 
 [[autodoc]] TextStreamer

docs/source/ja/internal/generation_utils.md

Lines changed: 0 additions & 6 deletions
@@ -335,12 +335,6 @@ generation_output[:2]
 - process
 - finalize
 
-## Utilities
-
-[[autodoc]] top_k_top_p_filtering
-
-[[autodoc]] tf_top_k_top_p_filtering
-
 ## Streamers
 
 [[autodoc]] TextStreamer

docs/source/zh/internal/generation_utils.md

Lines changed: 0 additions & 6 deletions
@@ -330,12 +330,6 @@ generation_output[:2]
 - process
 - finalize
 
-## Utilities
-
-[[autodoc]] top_k_top_p_filtering
-
-[[autodoc]] tf_top_k_top_p_filtering
-
 ## Streamers
 
 [[autodoc]] TextStreamer

src/transformers/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -1409,7 +1409,6 @@
             "TypicalLogitsWarper",
             "UnbatchedClassifierFreeGuidanceLogitsProcessor",
             "WhisperTimeStampLogitsProcessor",
-            "top_k_top_p_filtering",
         ]
     )
     _import_structure["generation_utils"] = []
@@ -3814,7 +3813,6 @@
             "TFTemperatureLogitsWarper",
             "TFTopKLogitsWarper",
             "TFTopPLogitsWarper",
-            "tf_top_k_top_p_filtering",
         ]
     )
     _import_structure["generation_tf_utils"] = []
@@ -6206,7 +6204,6 @@
             TypicalLogitsWarper,
             UnbatchedClassifierFreeGuidanceLogitsProcessor,
             WhisperTimeStampLogitsProcessor,
-            top_k_top_p_filtering,
         )
         from .modeling_utils import PreTrainedModel
         from .models.albert import (
@@ -8178,7 +8175,6 @@
             TFTemperatureLogitsWarper,
             TFTopKLogitsWarper,
             TFTopPLogitsWarper,
-            tf_top_k_top_p_filtering,
         )
         from .keras_callbacks import KerasMetricCallback, PushToHubCallback
         from .modeling_tf_utils import (
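
With these exports gone, `from transformers import top_k_top_p_filtering` (or `tf_top_k_top_p_filtering`) raises an `ImportError` from v4.39 on. A minimal sketch of the replacement imports, using the warper classes that remain in the public API:

```python
# The logits warpers named in the old deprecation warnings are still exported at the top level.
from transformers import TopKLogitsWarper, TopPLogitsWarper  # PyTorch
from transformers import TFTopKLogitsWarper, TFTopPLogitsWarper  # requires a TensorFlow install

# The removed helpers are no longer importable:
# from transformers import top_k_top_p_filtering     # ImportError in v4.39+
# from transformers import tf_top_k_top_p_filtering  # ImportError in v4.39+
```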

src/transformers/activations.py

Lines changed: 0 additions & 9 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import math
-import warnings
 from collections import OrderedDict
 
 import torch
@@ -138,14 +137,6 @@ def forward(self, input: Tensor) -> Tensor:
         return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))
 
 
-class SiLUActivation(nn.SiLU):
-    def __init__(self, *args, **kwargs):
-        warnings.warn(
-            "The SiLUActivation class has been deprecated and will be removed in v4.39. Please use nn.SiLU instead.",
-        )
-        super().__init__(*args, **kwargs)
-
-
 class MishActivation(nn.Module):
     """
     See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
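
As the removed warning message already suggested, `torch.nn.SiLU` is the drop-in replacement for the deleted `SiLUActivation` class; a minimal usage sketch:

```python
import torch
from torch import nn

# nn.SiLU computes x * sigmoid(x), which is all the removed SiLUActivation wrapper did
# (the wrapper only added a deprecation warning on top of nn.SiLU).
act = nn.SiLU()

x = torch.tensor([-1.0, 0.0, 1.0])
print(act(x))  # tensor([-0.2689, 0.0000, 0.7311])
```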

src/transformers/generation/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -88,7 +88,6 @@
     ]
     _import_structure["utils"] = [
         "GenerationMixin",
-        "top_k_top_p_filtering",
         "GreedySearchEncoderDecoderOutput",
         "GreedySearchDecoderOnlyOutput",
         "SampleEncoderDecoderOutput",
@@ -130,7 +129,6 @@
     ]
     _import_structure["tf_utils"] = [
         "TFGenerationMixin",
-        "tf_top_k_top_p_filtering",
         "TFGreedySearchDecoderOnlyOutput",
         "TFGreedySearchEncoderDecoderOutput",
         "TFSampleEncoderDecoderOutput",
@@ -241,7 +239,6 @@
             GreedySearchEncoderDecoderOutput,
             SampleDecoderOnlyOutput,
             SampleEncoderDecoderOutput,
-            top_k_top_p_filtering,
         )
 
     try:
@@ -279,7 +276,6 @@
             TFGreedySearchEncoderDecoderOutput,
             TFSampleDecoderOnlyOutput,
             TFSampleEncoderDecoderOutput,
-            tf_top_k_top_p_filtering,
         )
 
     try:

src/transformers/generation/tf_utils.py

Lines changed: 0 additions & 62 deletions
@@ -3088,68 +3088,6 @@ def contrastive_search_body_fn(
         return generated
 
 
-def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
-    """
-    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-
-    Args:
-        logits: logits distribution shape (batch size, vocabulary size)
-        top_k (`int`, *optional*, defaults to 0):
-            If > 0, only keep the top k tokens with highest probability (top-k filtering)
-        top_p (`float`, *optional*, defaults to 1.0):
-            If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
-            filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-        min_tokens_to_keep (`int`, *optional*, defaults to 1):
-            Minimumber of tokens we keep per batch example in the output.
-
-    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
-    """
-
-    warnings.warn(
-        "`tf_top_k_top_p_filtering` is scheduled for deletion in v4.39. Use `TFTopKLogitsWarper` and "
-        "`TFTopPLogitsWarper` instead.",
-        DeprecationWarning,
-    )
-
-    logits_shape = shape_list(logits)
-
-    if top_k > 0:
-        top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1])  # Safety check
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None]
-        logits = tf.where(indices_to_remove, filter_value, logits)
-    if top_p < 1.0:
-        sorted_indices = tf.argsort(logits, direction="DESCENDING")
-        sorted_logits = tf.gather(
-            logits, sorted_indices, axis=-1, batch_dims=1
-        )  # expects logits to be of dim (batch_size, vocab_size)
-
-        cumulative_probs = tf.math.cumsum(stable_softmax(sorted_logits, axis=-1), axis=-1)
-
-        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
-        sorted_indices_to_remove = cumulative_probs > top_p
-
-        if min_tokens_to_keep > 1:
-            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
-            sorted_indices_to_remove = tf.concat(
-                [
-                    tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]),
-                    sorted_indices_to_remove[:, min_tokens_to_keep:],
-                ],
-                -1,
-            )
-
-        # Shift the indices to the right to keep also the first token above the threshold
-        sorted_indices_to_remove = tf.concat(
-            [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, :-1]],
-            -1,
-        )
-        # scatter sorted tensors to original indexing
-        indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices)
-        logits = tf.where(indices_to_remove, filter_value, logits)
-    return logits
-
-
 def scatter_values_on_batch_indices(values, batch_indices):
     shape = shape_list(batch_indices)
     # broadcast batch dim to shape
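
The deleted helper's own warning pointed to `TFTopKLogitsWarper` and `TFTopPLogitsWarper` as the replacements. A minimal migration sketch, assuming the standard TF logits-processor call signature `(input_ids, scores, cur_len)` (these two warpers only look at `scores`); the logits values are illustrative:

```python
import tensorflow as tf
from transformers.generation import TFTopKLogitsWarper, TFTopPLogitsWarper

# Illustrative next-token logits: batch_size=1, vocab_size=5.
logits = tf.constant([[1.0, 2.0, 3.0, 4.0, 5.0]])

top_k = TFTopKLogitsWarper(top_k=3, filter_value=-float("inf"))
top_p = TFTopPLogitsWarper(top_p=0.9, filter_value=-float("inf"))

# Roughly equivalent to the old tf_top_k_top_p_filtering(logits, top_k=3, top_p=0.9);
# input_ids and cur_len are not inspected by these warpers, so placeholders are fine here.
filtered = top_p(None, top_k(None, logits, cur_len=0), cur_len=0)
print(filtered)  # tokens outside the top-k / nucleus are set to -inf
```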

src/transformers/generation/utils.py

Lines changed: 0 additions & 41 deletions
@@ -4810,47 +4810,6 @@ def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_at
     return outputs
 
 
-def top_k_top_p_filtering(
-    logits: torch.FloatTensor,
-    top_k: int = 0,
-    top_p: float = 1.0,
-    filter_value: float = -float("Inf"),
-    min_tokens_to_keep: int = 1,
-) -> torch.FloatTensor:
-    """
-    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
-
-    Args:
-        logits: logits distribution shape (batch size, vocabulary size)
-        top_k (`int`, *optional*, defaults to 0):
-            If > 0, only keep the top k tokens with highest probability (top-k filtering)
-        top_p (`float`, *optional*, defaults to 1.0):
-            If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
-            filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-        min_tokens_to_keep (`int`, *optional*, defaults to 1):
-            Minimumber of tokens we keep per batch example in the output.
-
-    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
-    """
-    warnings.warn(
-        "`top_k_top_p_filtering` is scheduled for deletion in v4.39. Use `TopKLogitsWarper` and `TopPLogitsWarper` "
-        "instead.",
-        DeprecationWarning,
-    )
-
-    if top_k > 0:
-        logits = TopKLogitsWarper(top_k=top_k, filter_value=filter_value, min_tokens_to_keep=min_tokens_to_keep)(
-            None, logits
-        )
-
-    if 0 <= top_p <= 1.0:
-        logits = TopPLogitsWarper(top_p=top_p, filter_value=filter_value, min_tokens_to_keep=min_tokens_to_keep)(
-            None, logits
-        )
-
-    return logits
-
-
 def _ranking_fast(
     context_hidden: torch.FloatTensor,
     next_hidden: torch.FloatTensor,
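
The removed `top_k_top_p_filtering` already delegated to `TopKLogitsWarper` and `TopPLogitsWarper` internally, so migrating means calling the warpers directly; a minimal sketch with illustrative logits:

```python
import torch
from transformers.generation import TopKLogitsWarper, TopPLogitsWarper

# Illustrative next-token logits: batch_size=1, vocab_size=5.
logits = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0]])

top_k = TopKLogitsWarper(top_k=3, filter_value=-float("inf"))
top_p = TopPLogitsWarper(top_p=0.9, filter_value=-float("inf"))

# Same chain the removed helper applied; these warpers ignore the input_ids argument,
# which is why the deleted code passed None for it.
filtered = top_p(None, top_k(None, logits))
probs = torch.softmax(filtered, dim=-1)  # filtered-out tokens end up with probability 0
```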

src/transformers/models/llama/modeling_llama.py

Lines changed: 5 additions & 8 deletions
@@ -129,10 +129,7 @@ def cos_cached(self):
         return self._cos_cached
 
     @torch.no_grad()
-    def forward(self, x, position_ids, seq_len=None):
-        if seq_len is not None:
-            logger.warning_once("The `seq_len` argument is deprecated and unused. It will be removed in v4.39.")
-
+    def forward(self, x, position_ids):
         # x: [bs, num_attention_heads, seq_len, head_size]
         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
         position_ids_expanded = position_ids[:, None, :].float()
@@ -151,17 +148,17 @@ def forward(self, x, position_ids, seq_len=None):
 class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
     """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
 
-    def forward(self, x, position_ids, seq_len=None):
+    def forward(self, x, position_ids):
         # difference to the original RoPE: a scaling factor is aplied to the position ids
         position_ids = position_ids.float() / self.scaling_factor
-        cos, sin = super().forward(x, position_ids, seq_len)
+        cos, sin = super().forward(x, position_ids)
         return cos, sin
 
 
 class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
     """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
 
-    def forward(self, x, position_ids, seq_len=None):
+    def forward(self, x, position_ids):
         # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length
         seq_len = torch.max(position_ids) + 1
         if seq_len > self.max_position_embeddings:
@@ -173,7 +170,7 @@ def forward(self, x, position_ids, seq_len=None):
             )
             self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: this may break with compilation
 
-        cos, sin = super().forward(x, position_ids, seq_len)
+        cos, sin = super().forward(x, position_ids)
         return cos, sin
 
 
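
After this change the rotary embedding `forward` signature is `(x, position_ids)`; the dynamic-NTK variant derives `seq_len` from `position_ids` itself. A hedged usage sketch (the constructor arguments and tensor shapes below are assumptions for illustration, not taken from the diff):

```python
import torch
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding

# Assumed constructor arguments (dim, max_position_embeddings, base); check modeling_llama.py
# for the exact signature in the installed version.
head_dim, batch_size, seq_len = 64, 2, 16
rotary_emb = LlamaRotaryEmbedding(head_dim, max_position_embeddings=4096, base=10000)

x = torch.randn(batch_size, 1, seq_len, head_dim)  # only x.dtype / x.device are consumed
position_ids = torch.arange(seq_len).unsqueeze(0).expand(batch_size, -1)

# Before this commit: rotary_emb(x, position_ids, seq_len=...)  -- seq_len was deprecated and unused.
cos, sin = rotary_emb(x, position_ids)
```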

src/transformers/models/opt/modeling_opt.py

Lines changed: 4 additions & 21 deletions
@@ -120,27 +120,10 @@ def __init__(
     ):
         super().__init__()
         self.config = config
-
-        def _handle_deprecated_argument(config_arg_name, config, fn_arg_name, kwargs):
-            """
-            If a the deprecated argument `fn_arg_name` is passed, raise a deprecation
-            warning and return that value, otherwise take the equivalent config.config_arg_name
-            """
-            val = None
-            if fn_arg_name in kwargs:
-                logging.warning(
-                    "Passing in {fn_arg_name} to {self.__class__.__name__} is deprecated and won't be supported from "
-                    "v4.39. Please set it in the config instead"
-                )
-                val = kwargs.pop(fn_arg_name)
-            else:
-                val = getattr(config, config_arg_name)
-            return val
-
-        self.embed_dim = _handle_deprecated_argument("hidden_size", config, "embed_dim", kwargs)
-        self.num_heads = _handle_deprecated_argument("num_attention_heads", config, "num_heads", kwargs)
-        self.dropout = _handle_deprecated_argument("attention_dropout", config, "dropout", kwargs)
-        self.enable_bias = _handle_deprecated_argument("enable_bias", config, "bias", kwargs)
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.dropout = config.attention_dropout
+        self.enable_bias = config.enable_bias
 
         self.head_dim = self.embed_dim // self.num_heads
         self.is_causal = True
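
`OPTAttention` now takes its dimensions straight from the model config instead of the deprecated keyword arguments. A sketch of the corresponding `OPTConfig` fields (the values are illustrative defaults, not taken from the diff):

```python
from transformers import OPTConfig

# The four attributes read by OPTAttention after this commit, set on the config up front
# rather than passed as embed_dim / num_heads / dropout / bias keyword arguments.
config = OPTConfig(
    hidden_size=768,          # -> self.embed_dim
    num_attention_heads=12,   # -> self.num_heads
    attention_dropout=0.0,    # -> self.dropout
    enable_bias=True,         # -> self.enable_bias
)
```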
