From 1b6a8af3ff00d4ff8a7782ad468ea0698e837906 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Sat, 5 Feb 2022 08:25:26 +0100
Subject: [PATCH 1/4] Add attribute_map

---
 src/transformers/models/swin/configuration_swin.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py
index 862c03f618773e..b53d93af2aedb2 100644
--- a/src/transformers/models/swin/configuration_swin.py
+++ b/src/transformers/models/swin/configuration_swin.py
@@ -90,6 +90,11 @@ class SwinConfig(PretrainedConfig):
     ```"""
     model_type = "swin"

+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "hidden_size": "embed_dim",
+    }
+
     def __init__(
         self,
         image_size=224,

From 5ac551e2f5ad56e447660dcd674211460aea0ad1 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Mon, 7 Feb 2022 16:03:37 +0100
Subject: [PATCH 2/4] Add mention in docs

---
 docs/source/model_doc/vision-encoder-decoder.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/model_doc/vision-encoder-decoder.mdx b/docs/source/model_doc/vision-encoder-decoder.mdx
index d766075a7355b1..0a3e73c143695d 100644
--- a/docs/source/model_doc/vision-encoder-decoder.mdx
+++ b/docs/source/model_doc/vision-encoder-decoder.mdx
@@ -13,8 +13,8 @@ specific language governing permissions and limitations under the License.
 # Vision Encoder Decoder Models

 The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text-sequence model with any
-pretrained vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit))
-and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert)).
+pretrained vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin))
+and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)).

 The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for
 example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang,
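The `attribute_map` added in [PATCH 1/4] is what lets model-agnostic code read Swin's architecture through the standard `PretrainedConfig` attribute names: `PretrainedConfig` redirects attribute access through `attribute_map`, so a generic name like `num_attention_heads` resolves to Swin's own `num_heads` field. A minimal sketch of the effect, assuming a transformers version that already contains this patch:

```python
from transformers import SwinConfig

config = SwinConfig()  # defaults: embed_dim=96, num_heads=[3, 6, 12, 24]

# PretrainedConfig redirects attribute access through attribute_map, so the
# generic name used by model-agnostic code resolves to Swin's own field:
print(config.num_attention_heads)  # [3, 6, 12, 24], read from config.num_heads
```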
From 6afff8a7835c3249d566e72d22716831433d43c2 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 8 Feb 2022 15:36:38 -0500
Subject: [PATCH 3/4] Set hidden_size attribute correctly

---
 src/transformers/models/swin/configuration_swin.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py
index b53d93af2aedb2..7b70c2692631d8 100644
--- a/src/transformers/models/swin/configuration_swin.py
+++ b/src/transformers/models/swin/configuration_swin.py
@@ -92,7 +92,6 @@ class SwinConfig(PretrainedConfig):

     attribute_map = {
         "num_attention_heads": "num_heads",
-        "hidden_size": "embed_dim",
     }

     def __init__(
@@ -135,3 +134,6 @@ def __init__(
         self.path_norm = patch_norm
         self.layer_norm_eps = layer_norm_eps
         self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = embed_dim * 8

From 89789a956ae04367f667658af0ab34ed301a9c06 Mon Sep 17 00:00:00 2001
From: Niels Rogge
Date: Tue, 8 Feb 2022 16:28:37 -0500
Subject: [PATCH 4/4] Add note about Transformer-based models only

---
 docs/source/model_doc/vision-encoder-decoder.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/model_doc/vision-encoder-decoder.mdx b/docs/source/model_doc/vision-encoder-decoder.mdx
index 0a3e73c143695d..987924d4ad7c03 100644
--- a/docs/source/model_doc/vision-encoder-decoder.mdx
+++ b/docs/source/model_doc/vision-encoder-decoder.mdx
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 # Vision Encoder Decoder Models

 The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text-sequence model with any
-pretrained vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin))
+pretrained Transformer-based vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin))
 and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)).

 The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for
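Taken together, the patches let a Swin encoder plug into [`VisionEncoderDecoderModel`], whose generic code reads the encoder's `hidden_size` to size the cross-attention over the encoder features. A minimal sketch of the resulting usage, assuming a transformers version containing these patches; the weights are randomly initialized, and GPT2 stands in here for any decoder with cross-attention support (this particular pairing is an illustration, not taken from the patches):

```python
from transformers import (
    GPT2Config,
    SwinConfig,
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
)

# Combine the two configs; this also sets is_decoder=True and
# add_cross_attention=True on the decoder config.
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(
    SwinConfig(), GPT2Config()
)
model = VisionEncoderDecoderModel(config=config)  # randomly initialized

# Per [PATCH 3/4], hidden_size reports the channel dimension after the
# last stage: embed_dim * 8 = 96 * 8 = 768 for the default config.
print(model.config.encoder.hidden_size)  # 768
```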