diff --git a/src/transformers/adapters/utils.py b/src/transformers/adapters/utils.py index 8b02b46b40..cc93c68ea2 100644 --- a/src/transformers/adapters/utils.py +++ b/src/transformers/adapters/utils.py @@ -722,8 +722,9 @@ def resolve_adapter_path( except Exception as ex: logger.info(ex) raise EnvironmentError( - "Unable to load adapter {} from any source. Please check the name of the adapter or the source." - .format(adapter_name_or_path) + "Unable to load adapter {} from any source. Please check the name of the adapter or the source.".format( + adapter_name_or_path + ) ) else: raise ValueError("Unable to identify {} as a valid module location.".format(adapter_name_or_path)) diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py index f1df324eb6..f2c8b2cd42 100644 --- a/src/transformers/commands/add_new_model_like.py +++ b/src/transformers/commands/add_new_model_like.py @@ -438,9 +438,14 @@ def duplicate_module( # Special cases if "PRETRAINED_CONFIG_ARCHIVE_MAP = {" in obj: # docstyle-ignore - obj = f"{new_model_patterns.model_upper_cased}_PRETRAINED_CONFIG_ARCHIVE_MAP = " + "{" + f""" + obj = ( + f"{new_model_patterns.model_upper_cased}_PRETRAINED_CONFIG_ARCHIVE_MAP = " + + "{" + + f""" "{new_model_patterns.checkpoint}": "https://huggingface.co/{new_model_patterns.checkpoint}/resolve/main/config.json", -""" + "}\n" +""" + + "}\n" + ) new_objects.append(obj) continue elif "PRETRAINED_MODEL_ARCHIVE_LIST = [" in obj: diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index 0e0f0bc368..944b88afba 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -1046,11 +1046,13 @@ def __del__(self): class NeptuneMissingConfiguration(Exception): def __init__(self): - super().__init__(""" + super().__init__( + """ ------ Unsupported ---- We were not able to create new runs. You provided a custom Neptune run to `NeptuneCallback` with the `run` argument. For the integration to work fully, provide your `api_token` and `project` by saving them as environment variables or passing them to the callback. - """) + """ + ) class NeptuneCallback(TrainerCallback): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 50a14f7d0f..66aa8b86de 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2423,11 +2423,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P key: device_map[key] for key in device_map.keys() if key not in modules_to_not_convert } if "cpu" in device_map_without_lm_head.values() or "disk" in device_map_without_lm_head.values(): - raise ValueError(""" + raise ValueError( + """ Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you have set a value for `max_memory` you should increase that. To have an idea of the modules that are set on the CPU or RAM you can print model.hf_device_map. - """) + """ + ) del device_map_without_lm_head if from_tf: diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index f750bf80ee..cdb9e787b7 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -927,9 +927,11 @@ def bigbird_block_sparse_attention( attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ :, :, :, :to_block_size ] # 1st key block (global) - attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( - second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] - ) # last three blocks (global + sliding) + attention_probs[ + :, :, -2 * from_block_size : -from_block_size, -3 * to_block_size : + ] = second_last_attn_weights[ + :, :, :, to_block_size : 4 * to_block_size + ] # last three blocks (global + sliding) # random keys for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 5236564760..bc202956d5 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -745,9 +745,11 @@ def bigbird_block_sparse_attention( attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ :, :, :, :to_block_size ] # 1st key block (global) - attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( - second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] - ) # last three blocks (global + sliding) + attention_probs[ + :, :, -2 * from_block_size : -from_block_size, -3 * to_block_size : + ] = second_last_attn_weights[ + :, :, :, to_block_size : 4 * to_block_size + ] # last three blocks (global + sliding) # random keys for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index c5970f9333..6a3e81e25e 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -88,72 +88,72 @@ def load_tf_weights_in_convbert(model, config, tf_checkpoint_path): group_dense_name = "dense" for j in range(config.num_hidden_layers): - param_mapping[f"encoder.layer.{j}.attention.self.query.weight"] = ( - f"electra/encoder/layer_{j}/attention/self/query/kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.self.query.bias"] = ( - f"electra/encoder/layer_{j}/attention/self/query/bias" - ) - param_mapping[f"encoder.layer.{j}.attention.self.key.weight"] = ( - f"electra/encoder/layer_{j}/attention/self/key/kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.self.key.bias"] = ( - f"electra/encoder/layer_{j}/attention/self/key/bias" - ) - param_mapping[f"encoder.layer.{j}.attention.self.value.weight"] = ( - f"electra/encoder/layer_{j}/attention/self/value/kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.self.value.bias"] = ( - f"electra/encoder/layer_{j}/attention/self/value/bias" - ) - param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight"] = ( - f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight"] = ( - f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias"] = ( - f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias" - ) - param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight"] = ( - f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias"] = ( - f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias" - ) - param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.weight"] = ( - f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.bias"] = ( - f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias" - ) - param_mapping[f"encoder.layer.{j}.attention.output.dense.weight"] = ( - f"electra/encoder/layer_{j}/attention/output/dense/kernel" - ) - param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.weight"] = ( - f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma" - ) - param_mapping[f"encoder.layer.{j}.attention.output.dense.bias"] = ( - f"electra/encoder/layer_{j}/attention/output/dense/bias" - ) - param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.bias"] = ( - f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta" - ) - param_mapping[f"encoder.layer.{j}.intermediate.dense.weight"] = ( - f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel" - ) - param_mapping[f"encoder.layer.{j}.intermediate.dense.bias"] = ( - f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias" - ) - param_mapping[f"encoder.layer.{j}.output.dense.weight"] = ( - f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel" - ) - param_mapping[f"encoder.layer.{j}.output.dense.bias"] = ( - f"electra/encoder/layer_{j}/output/{group_dense_name}/bias" - ) - param_mapping[f"encoder.layer.{j}.output.LayerNorm.weight"] = ( - f"electra/encoder/layer_{j}/output/LayerNorm/gamma" - ) + param_mapping[ + f"encoder.layer.{j}.attention.self.query.weight" + ] = f"electra/encoder/layer_{j}/attention/self/query/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.query.bias" + ] = f"electra/encoder/layer_{j}/attention/self/query/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.key.weight" + ] = f"electra/encoder/layer_{j}/attention/self/key/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.key.bias" + ] = f"electra/encoder/layer_{j}/attention/self/key/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.value.weight" + ] = f"electra/encoder/layer_{j}/attention/self/value/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.value.bias" + ] = f"electra/encoder/layer_{j}/attention/self/value/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_out_layer.weight" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.self.conv_out_layer.bias" + ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias" + param_mapping[ + f"encoder.layer.{j}.attention.output.dense.weight" + ] = f"electra/encoder/layer_{j}/attention/output/dense/kernel" + param_mapping[ + f"encoder.layer.{j}.attention.output.LayerNorm.weight" + ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma" + param_mapping[ + f"encoder.layer.{j}.attention.output.dense.bias" + ] = f"electra/encoder/layer_{j}/attention/output/dense/bias" + param_mapping[ + f"encoder.layer.{j}.attention.output.LayerNorm.bias" + ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta" + param_mapping[ + f"encoder.layer.{j}.intermediate.dense.weight" + ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel" + param_mapping[ + f"encoder.layer.{j}.intermediate.dense.bias" + ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias" + param_mapping[ + f"encoder.layer.{j}.output.dense.weight" + ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel" + param_mapping[ + f"encoder.layer.{j}.output.dense.bias" + ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/bias" + param_mapping[ + f"encoder.layer.{j}.output.LayerNorm.weight" + ] = f"electra/encoder/layer_{j}/output/LayerNorm/gamma" param_mapping[f"encoder.layer.{j}.output.LayerNorm.bias"] = f"electra/encoder/layer_{j}/output/LayerNorm/beta" for param in model.named_parameters(): diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 28d763bc16..507f10cb77 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -106,22 +106,22 @@ def convert_state_dict(orig_state_dict, model): orig_state_dict[ f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" ] = val[:dim, :] - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( - val[dim : dim * 2, :] - ) + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight" + ] = val[dim : dim * 2, :] orig_state_dict[ f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" ] = val[-dim:, :] else: - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( - val[:dim] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( - val[dim : dim * 2] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( - val[-dim:] - ) + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias" + ] = val[:dim] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias" + ] = val[dim : dim * 2] + orig_state_dict[ + f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" + ] = val[-dim:] elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: # HuggingFace implementation doesn't use attn_mask buffer # and model doesn't use final LayerNorms for the encoder diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index a0f1e6b503..2074e36754 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -775,11 +775,14 @@ def forward(self, hidden_states: torch.Tensor): FLAVA_TEXT_INPUTS_DOCSTRING = FLAVA_TEXT_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON -FLAVA_MULTIMODAL_INPUTS_DOCSTRING = r""" +FLAVA_MULTIMODAL_INPUTS_DOCSTRING = ( + r""" Args: hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`): The concatenated hidden states of unimodal encoders. -""" + FLAVA_INPUTS_DOCSTRING_COMMON +""" + + FLAVA_INPUTS_DOCSTRING_COMMON +) FLAVA_MODEL_INPUTS_DOCSTRING_BASE = r""" Args: @@ -1260,7 +1263,9 @@ def get_text_features( ... text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt" ... ) >>> text_features = model.get_text_features(**inputs) - ```""".format(_CHECKPOINT_FOR_DOC) + ```""".format( + _CHECKPOINT_FOR_DOC + ) text_outputs = self.text_model( input_ids=input_ids, attention_mask=attention_mask, @@ -1309,7 +1314,9 @@ def get_image_features( >>> inputs = processor(images=image, return_tensors="pt") >>> image_features = model.get_image_features(**inputs) - ```""".format(_CHECKPOINT_FOR_DOC) + ```""".format( + _CHECKPOINT_FOR_DOC + ) image_outputs = self.image_model( pixel_values=pixel_values, bool_masked_pos=bool_masked_pos, @@ -1574,7 +1581,9 @@ def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor: >>> outputs = model.get_codebook_indices(**inputs) ``` - """.format(_CHECKPOINT_FOR_CODEBOOK_DOC) + """.format( + _CHECKPOINT_FOR_CODEBOOK_DOC + ) z_logits = self.blocks(pixel_values) return torch.argmax(z_logits, axis=1) @@ -1609,7 +1618,9 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: >>> print(outputs.shape) (1, 196) ``` - """.format(_CHECKPOINT_FOR_CODEBOOK_DOC) + """.format( + _CHECKPOINT_FOR_CODEBOOK_DOC + ) if len(pixel_values.shape) != 4: raise ValueError(f"input shape {pixel_values.shape} is not 4d") if pixel_values.shape[1] != self.input_channels: diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index 0d676dae6e..2b72053863 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -1213,8 +1213,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index 270f7cd77d..314d50ef9a 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -1345,8 +1345,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) diff --git a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py b/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py index 38327ec898..c175144623 100644 --- a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py +++ b/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py @@ -236,14 +236,24 @@ def write_model_card(self, model_dict, dry_run=False) -> str: * OPUS readme: [README.md]({readme_url}) """ - content = f""" + content = ( + f""" * model: {model_dict['modeltype']} * source language code{src_multilingual*'s'}: {', '.join(a2_src_tags)} * target language code{tgt_multilingual*'s'}: {', '.join(a2_tgt_tags)} * dataset: opus {backtranslated_data} * release date: {model_dict['release-date']} * pre-processing: {model_dict['pre-processing']} -""" + multilingual_data + tuned + download + langtoken + datainfo + testset + testscores + scorestable +""" + + multilingual_data + + tuned + + download + + langtoken + + datainfo + + testset + + testscores + + scorestable + ) content = FRONT_MATTER_TEMPLATE.format(lang_tags) + extra_markdown + content diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py index f7d0e445d0..857df7ea01 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm.py +++ b/src/transformers/models/markuplm/tokenization_markuplm.py @@ -1315,8 +1315,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index 93184e8ef7..b76e68f906 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -143,29 +143,29 @@ def ema(x): tf_to_pt_map[prefix + "BatchNorm/beta"] = model.segmentation_head.conv_pool.normalization.bias tf_to_pt_map[prefix + "BatchNorm/gamma"] = model.segmentation_head.conv_pool.normalization.weight tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = model.segmentation_head.conv_pool.normalization.running_mean - tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = ( - model.segmentation_head.conv_pool.normalization.running_var - ) + tf_to_pt_map[ + prefix + "BatchNorm/moving_variance" + ] = model.segmentation_head.conv_pool.normalization.running_var prefix = "aspp0/" tf_to_pt_map[prefix + "weights"] = model.segmentation_head.conv_aspp.convolution.weight tf_to_pt_map[prefix + "BatchNorm/beta"] = model.segmentation_head.conv_aspp.normalization.bias tf_to_pt_map[prefix + "BatchNorm/gamma"] = model.segmentation_head.conv_aspp.normalization.weight tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = model.segmentation_head.conv_aspp.normalization.running_mean - tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = ( - model.segmentation_head.conv_aspp.normalization.running_var - ) + tf_to_pt_map[ + prefix + "BatchNorm/moving_variance" + ] = model.segmentation_head.conv_aspp.normalization.running_var prefix = "concat_projection/" tf_to_pt_map[prefix + "weights"] = model.segmentation_head.conv_projection.convolution.weight tf_to_pt_map[prefix + "BatchNorm/beta"] = model.segmentation_head.conv_projection.normalization.bias tf_to_pt_map[prefix + "BatchNorm/gamma"] = model.segmentation_head.conv_projection.normalization.weight - tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = ( - model.segmentation_head.conv_projection.normalization.running_mean - ) - tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = ( - model.segmentation_head.conv_projection.normalization.running_var - ) + tf_to_pt_map[ + prefix + "BatchNorm/moving_mean" + ] = model.segmentation_head.conv_projection.normalization.running_mean + tf_to_pt_map[ + prefix + "BatchNorm/moving_variance" + ] = model.segmentation_head.conv_projection.normalization.running_var prefix = "logits/semantic/" tf_to_pt_map[ema(prefix + "weights")] = model.segmentation_head.classifier.convolution.weight diff --git a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py index 09eb623614..302bd6f3f7 100644 --- a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py @@ -95,15 +95,15 @@ def convert_state_dict(orig_state_dict, model): dim = model.swin.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size if "weight" in key: - orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"] = ( - val[:dim, :] - ) + orig_state_dict[ + f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" + ] = val[:dim, :] orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = val[ dim : dim * 2, : ] - orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"] = ( - val[-dim:, :] - ) + orig_state_dict[ + f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" + ] = val[-dim:, :] else: orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = val[ :dim diff --git a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py index f71ce358d1..860fdd1b54 100644 --- a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py @@ -102,15 +102,15 @@ def convert_state_dict(orig_state_dict, model): dim = model.swin.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size if "weight" in key: - orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"] = ( - val[:dim, :] - ) + orig_state_dict[ + f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" + ] = val[:dim, :] orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = val[ dim : dim * 2, : ] - orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"] = ( - val[-dim:, :] - ) + orig_state_dict[ + f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" + ] = val[-dim:, :] else: orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = val[ :dim diff --git a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py index ae06f7fa86..38a11496f7 100644 --- a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py +++ b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py @@ -137,22 +137,22 @@ def convert_state_dict(orig_state_dict, config): orig_state_dict[ f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.weight" ] = val[:dim, :] - orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.weight"] = ( - val[dim : dim * 2, :] - ) + orig_state_dict[ + f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.weight" + ] = val[dim : dim * 2, :] orig_state_dict[ f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.weight" ] = val[-dim:, :] else: - orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.bias"] = ( - val[:dim] - ) - orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.bias"] = ( - val[dim : dim * 2] - ) - orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.bias"] = ( - val[-dim:] - ) + orig_state_dict[ + f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.bias" + ] = val[:dim] + orig_state_dict[ + f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.bias" + ] = val[dim : dim * 2] + orig_state_dict[ + f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.bias" + ] = val[-dim:] pass else: orig_state_dict[rename_key(key, config)] = val diff --git a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py index f0751eafa8..7af3bfb86c 100644 --- a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py +++ b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py @@ -145,22 +145,22 @@ def convert_state_dict(orig_state_dict, model): orig_state_dict[ f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" ] = val[:dim, :] - orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( - val[dim : dim * 2, :] - ) + orig_state_dict[ + f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight" + ] = val[dim : dim * 2, :] orig_state_dict[ f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" ] = val[-dim:, :] else: - orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( - val[:dim] - ) + orig_state_dict[ + f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias" + ] = val[:dim] orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = val[ dim : dim * 2 ] - orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( - val[-dim:] - ) + orig_state_dict[ + f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" + ] = val[-dim:] else: orig_state_dict[rename_key(key)] = val diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 08f5cf9e78..4cfed3a2ef 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -3274,8 +3274,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index bcbe808013..b5d23417ce 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -33,14 +33,12 @@ def assert_device_map(device_map, num_blocks): if len(duplicate_blocks) != 0: raise ValueError( "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device." - " These attention blocks were specified more than once: " - + str(duplicate_blocks) + " These attention blocks were specified more than once: " + str(duplicate_blocks) ) if len(missing_blocks) != 0: raise ValueError( "There are attention blocks for this model that are not specified in the device_map. Add these attention " - "blocks to a device on the device_map: " - + str(missing_blocks) + "blocks to a device on the device_map: " + str(missing_blocks) ) if len(extra_blocks) != 0: raise ValueError( diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 481e567e70..0291873572 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -1024,21 +1024,25 @@ def run_trainer( actions = 0 if do_train: actions += 1 - args.extend(f""" + args.extend( + f""" --do_train --num_train_epochs {str(num_train_epochs)} --max_train_samples 16 --per_device_train_batch_size 2 --learning_rate 3e-3 - """.split()) + """.split() + ) if do_eval: actions += 1 - args.extend(""" + args.extend( + """ --do_eval --max_eval_samples 16 --per_device_eval_batch_size 2 - """.split()) + """.split() + ) assert actions > 0, "need at least do_train or do_eval for the test to run" diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 54180b0112..0ae5ed4c82 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -523,9 +523,7 @@ def test_inference(self): processor = AltCLIPProcessor.from_pretrained(model_name) image = prepare_img() - inputs = processor(text=["一张猫的照片", "一张狗的照片"], images=image, padding=True, return_tensors="pt").to( - torch_device - ) + inputs = processor(text=["一张猫的照片", "一张狗的照片"], images=image, padding=True, return_tensors="pt").to(torch_device) # forward pass with torch.no_grad(): diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index 21b04c18c3..a9c983bab3 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -351,18 +351,14 @@ def test_wordpiece_tokenizer(self): self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"]) - self.assertListEqual( - tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"] - ) + self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"]) def test_sentencepiece_tokenizer(self): tokenizer = BertJapaneseTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese-with-auto-jumanpp") subword_tokenizer = tokenizer.subword_tokenizer tokens = subword_tokenizer.tokenize("国境 の 長い トンネル を 抜ける と 雪国 であった 。") - self.assertListEqual( - tokens, ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "国", "▁であった", "▁。"] - ) + self.assertListEqual(tokens, ["▁国境", "▁の", "▁長い", "▁トンネル", "▁を", "▁抜ける", "▁と", "▁雪", "国", "▁であった", "▁。"]) tokens = subword_tokenizer.tokenize("こんばんは こんばん にち は こんにちは") self.assertListEqual(tokens, ["▁こん", "ばん", "は", "▁こん", "ばん", "▁に", "ち", "▁は", "▁こんにちは"]) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index b554243f01..12ab3e5d1d 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -677,9 +677,9 @@ def test_inference(self): processor = ChineseCLIPProcessor.from_pretrained(model_name) image = prepare_img() - inputs = processor( - text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="pt" - ).to(torch_device) + inputs = processor(text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, padding=True, return_tensors="pt").to( + torch_device + ) # forward pass with torch.no_grad(): diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index a49dae8237..30e44d4d48 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -317,18 +317,24 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - @unittest.skip(reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic - hidden states""") + @unittest.skip( + reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic + hidden states""" + ) def test_save_load(self): pass - @unittest.skip(reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic - hidden states""") + @unittest.skip( + reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic + hidden states""" + ) def test_determinism(self): pass - @unittest.skip(reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic - hidden states""") + @unittest.skip( + reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic + hidden states""" + ) def test_model_outputs_equivalence(self): pass diff --git a/tests/models/vit_mae/test_modeling_tf_vit_mae.py b/tests/models/vit_mae/test_modeling_tf_vit_mae.py index 65624ba069..8c19c01491 100644 --- a/tests/models/vit_mae/test_modeling_tf_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_tf_vit_mae.py @@ -439,8 +439,10 @@ def test_save_load_config(self): self.assert_outputs_same(after_outputs, outputs) - @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""") + @unittest.skip( + reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load + to get deterministic results.""" + ) def test_determinism(self): pass diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index 9ffa76bba6..35693da4cf 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -252,18 +252,24 @@ def test_save_load(self): max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) - @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""") + @unittest.skip( + reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load + to get deterministic results.""" + ) def test_determinism(self): pass - @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""") + @unittest.skip( + reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load + to get deterministic results.""" + ) def test_save_load_fast_init_from_base(self): pass - @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""") + @unittest.skip( + reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load + to get deterministic results.""" + ) def test_save_load_fast_init_to_base(self): pass diff --git a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py index 2cda0acf10..cdf962d83d 100644 --- a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py +++ b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py @@ -115,8 +115,7 @@ def test_xprophetnet_ntg_inference(self): " к кибератакам." ) ZH_SENTENCE = ( - "根据该组织的官方门户网站,微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。" - "从那时起,该系统的用户将无法接收安全更新,这可能会使他们的计算机容易受到网络攻击。" + "根据该组织的官方门户网站,微软公司打算在2020年1月14日之后正式终止对Windows 7操作系统的免费支持。" "从那时起,该系统的用户将无法接收安全更新,这可能会使他们的计算机容易受到网络攻击。" ) input_ids = tokenizer( @@ -143,9 +142,8 @@ def test_xprophetnet_ntg_inference(self): tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1 ] EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ") - EXPECTED_TITLE_RU_BEAM1_TOK = ( - "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года" - .split(" ") + EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split( + " " ) EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ") self.assertListEqual( diff --git a/tests/pipelines/test_pipelines_conversational.py b/tests/pipelines/test_pipelines_conversational.py index 9d7ada7b7a..39ad2175dc 100644 --- a/tests/pipelines/test_pipelines_conversational.py +++ b/tests/pipelines/test_pipelines_conversational.py @@ -41,12 +41,16 @@ class ConversationalPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseM model_mapping = dict( list(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) if MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - else [] + list(MODEL_FOR_CAUSAL_LM_MAPPING.items()) if MODEL_FOR_CAUSAL_LM_MAPPING else [] + else [] + list(MODEL_FOR_CAUSAL_LM_MAPPING.items()) + if MODEL_FOR_CAUSAL_LM_MAPPING + else [] ) tf_model_mapping = dict( list(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) if TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - else [] + list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.items()) if TF_MODEL_FOR_CAUSAL_LM_MAPPING else [] + else [] + list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.items()) + if TF_MODEL_FOR_CAUSAL_LM_MAPPING + else [] ) def get_test_pipeline(self, model, tokenizer, feature_extractor): diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index 57aba7f35a..28cde51a8e 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -187,11 +187,13 @@ def get_test_pipeline(self, model, tokenizer, feature_extractor): self.skipTest("This is a bimodal model, we need to find a more consistent way to switch on those models.") return elif model.config.is_encoder_decoder: - self.skipTest("""encoder_decoder models are trickier for this pipeline. + self.skipTest( + """encoder_decoder models are trickier for this pipeline. Do we want encoder + decoder inputs to get some featues? Do we want encoder only features ? For now ignore those. - """) + """ + ) return feature_extractor = FeatureExtractionPipeline( diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index 2623addca0..e4bced94fd 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -179,9 +179,7 @@ def test_small_model_japanese(self): "question-answering", model="KoichiYasuoka/deberta-base-japanese-aozora-ud-head", ) - output = question_answerer( - question="国語", context="全学年にわたって小学校の国語の教科書に挿し絵が用いられている" - ) + output = question_answerer(question="国語", context="全学年にわたって小学校の国語の教科書に挿し絵が用いられている") # Wrong answer, the whole text is identified as one "word" since the tokenizer does not include # a pretokenizer