diff --git a/docs/source/model_doc/vision-encoder-decoder.mdx b/docs/source/model_doc/vision-encoder-decoder.mdx
index d766075a7355b1..987924d4ad7c03 100644
--- a/docs/source/model_doc/vision-encoder-decoder.mdx
+++ b/docs/source/model_doc/vision-encoder-decoder.mdx
@@ -13,8 +13,8 @@ specific language governing permissions and limitations under the License.
 # Vision Encoder Decoder Models
 
 The [`VisionEncoderDecoderModel`] can be used to initialize an image-to-text-sequence model with any
-pretrained vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit))
-and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert)).
+pretrained Transformer-based vision autoencoding model as the encoder (*e.g.* [ViT](vit), [BEiT](beit), [DeiT](deit), [Swin](swin))
+and any pretrained language model as the decoder (*e.g.* [RoBERTa](roberta), [GPT2](gpt2), [BERT](bert), [DistilBERT](distilbert)).
 
 The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for
 example) [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang,
diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py
index 862c03f618773e..7b70c2692631d8 100644
--- a/src/transformers/models/swin/configuration_swin.py
+++ b/src/transformers/models/swin/configuration_swin.py
@@ -90,6 +90,10 @@ class SwinConfig(PretrainedConfig):
     ```"""
 
     model_type = "swin"
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+    }
+
     def __init__(
         self,
         image_size=224,
@@ -130,3 +134,6 @@ def __init__(
         self.path_norm = patch_norm
         self.layer_norm_eps = layer_norm_eps
         self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = embed_dim * 8
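
For context, here is a minimal sketch (not part of the patch) of what the change enables: pairing a pretrained Swin encoder with a pretrained text decoder through `VisionEncoderDecoderModel.from_encoder_decoder_pretrained`. The checkpoint names (`microsoft/swin-base-patch4-window7-224-in22k`, `gpt2`) are illustrative choices from the Hub, not prescribed by this diff.

```python
# Minimal sketch (assumed checkpoints): pair a pretrained Swin encoder with a
# pretrained autoregressive text decoder via VisionEncoderDecoderModel.
from transformers import VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "microsoft/swin-base-patch4-window7-224-in22k",  # Swin encoder (illustrative checkpoint)
    "gpt2",  # text decoder; is_decoder/add_cross_attention are set automatically
)

# The new SwinConfig.hidden_size (embed_dim * 8, the channel dimension after the last
# stage) is what lets the generic encoder-decoder wiring compare encoder and decoder
# widths and, if they differ, insert a projection between them.
print(model.config.encoder.hidden_size, model.config.decoder.hidden_size)
```

The `attribute_map` entry plays a similar role for `num_attention_heads`, which generic encoder-decoder code may query on the encoder config even though Swin stores it under `num_heads`.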