From d14c03b40ea78e7b706099e8e1bbea6b48e733b0 Mon Sep 17 00:00:00 2001
From: Yosua Michael Maranatha
Date: Thu, 5 May 2022 10:16:30 -0700
Subject: [PATCH] [fbsync] Adding the huge vision transformer from SWAG (#5721)

Summary:
* Add vit_b_16_swag
* Better handling idiom for image_size; edit test_extended_model to handle the case where the number of parameters differs from the default due to a different image size input
* Update the accuracy to the experiment result on the torchvision model
* Fix typo: missing underscore
* Raise an exception instead of torch._assert; add back the publication year (accidentally deleted)
* Add license information to meta and readme
* Improve wording and fix typo for the pretrained model license in the readme
* Add vit_l_16 weights
* Update README.rst
* Update the accuracy meta on the vit_l_16_swag model to the result from our experiment
* Add vit_h_14_swag model
* Add accuracy from experiments
* Add vit_h_14 model to hubconf.py
* Add docs and expected pkl file for test
* Remove legacy compatibility for ViT_H_14 model
* Test vit_h_14 with a smaller image_size to speed up the test

(Note: this ignores all push blocking failures!)

Reviewed By: jdsgomes, NicolasHug

Differential Revision: D36095649

fbshipit-source-id: 639dab0577088e18e1bcfa06fd1f01be20c3fd44

Co-authored-by: Vasilis Vryniotis
Co-authored-by: Vasilis Vryniotis
Co-authored-by: Vasilis Vryniotis
---
 docs/source/models.rst                    |  3 ++
 hubconf.py                                |  1 +
 .../ModelTester.test_vit_h_14_expect.pkl  | Bin 0 -> 939 bytes
 test/test_models.py                       |  4 ++
 torchvision/models/vision_transformer.py  | 46 ++++++++++++++++++
 5 files changed, 54 insertions(+)
 create mode 100644 test/expect/ModelTester.test_vit_h_14_expect.pkl

diff --git a/docs/source/models.rst b/docs/source/models.rst
index 16825d2b8b2..f84d9c7fd1a 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -92,6 +92,7 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
+    vit_h_14 = models.vit_h_14()
     convnext_tiny = models.convnext_tiny()
     convnext_small = models.convnext_small()
     convnext_base = models.convnext_base()
@@ -213,6 +214,7 @@ vit_b_16 81.072 95.318
 vit_b_32 75.912 92.466
 vit_l_16 79.662 94.638
 vit_l_32 76.972 93.070
+vit_h_14 88.552 98.694
 convnext_tiny 82.520 96.146
 convnext_small 83.616 96.650
 convnext_base 84.062 96.870
@@ -434,6 +436,7 @@ VisionTransformer
     vit_b_32
     vit_l_16
     vit_l_32
+    vit_h_14

 ConvNeXt
 --------

diff --git a/hubconf.py b/hubconf.py
index c3de4f2da9a..bbd5da52b13 100644
--- a/hubconf.py
+++ b/hubconf.py
@@ -67,4 +67,5 @@
     vit_b_32,
     vit_l_16,
     vit_l_32,
+    vit_h_14,
 )

diff --git a/test/expect/ModelTester.test_vit_h_14_expect.pkl b/test/expect/ModelTester.test_vit_h_14_expect.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1f846beb6a0bccf8b545f5a67b74482015cc878b
GIT binary patch
literal 939
[binary data omitted]

diff --git a/torchvision/models/vision_transformer.py b/torchvision/models/vision_transformer.py
--- a/torchvision/models/vision_transformer.py
+++ b/torchvision/models/vision_transformer.py
@@ -531,6 +554,29 @@ def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = Tru
     )
 
 
+def vit_h_14(*, weights: Optional[ViT_H_14_Weights] = None, progress: bool = True, **kwargs: Any) -> VisionTransformer:
+    """
+    Constructs a vit_h_14 architecture from
+    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/abs/2010.11929>`_.
+
+    Args:
+        weights (ViT_H_14_Weights, optional): The pretrained weights for the model
+        progress (bool): If True, displays a progress bar of the download to stderr
+    """
+    weights = ViT_H_14_Weights.verify(weights)
+
+    return _vision_transformer(
+        patch_size=14,
+        num_layers=32,
+        num_heads=16,
+        hidden_dim=1280,
+        mlp_dim=5120,
+        weights=weights,
+        progress=progress,
+        **kwargs,
+    )
+
+
 def interpolate_embeddings(
     image_size: int,
     patch_size: int,
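
A minimal usage sketch (not part of the patch above): it builds the new vit_h_14 architecture with random weights and runs a forward pass at the default 224x224 input size, which is divisible by the 14-pixel patch size. It assumes a torchvision build that includes this change; the SWAG weight enum members are not visible in the surviving diff, so no pretrained weights are loaded here.

import torch
from torchvision.models import vit_h_14

# Build ViT-H/14 with random weights: patch_size=14, 32 layers, 16 heads,
# hidden_dim=1280, mlp_dim=5120, matching the constructor added in this patch.
model = vit_h_14(weights=None)
model.eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))  # default image_size is 224

print(out.shape)  # torch.Size([1, 1000]) -- ImageNet-1k logits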