Commit 981c763

Disable Attention Pooling by default because of its input size restrictions
1 parent 599d8e1 commit 981c763

File tree: 2 files changed (+51, -9 lines)

captum/optim/models/_image/clip_resnet50x4_image.py

Lines changed: 17 additions & 4 deletions
@@ -23,8 +23,7 @@ def clip_resnet50x4_image(
     This model can be combined with the CLIP ResNet 50x4 Text model to create the full
     CLIP ResNet 50x4 model.
 
-    Note that model inputs are expected to have a shape of: [B, 3, 288, 288] or
-    [3, 288, 288].
+    Note that the model was trained on inputs with a shape of: [B, 3, 288, 288].
 
     See here for more details:
     https://github.com/openai/CLIP
@@ -48,6 +47,10 @@ def clip_resnet50x4_image(
         transform_input (bool, optional): If True, preprocesses the input according to
             the method with which it was trained.
             Default: *True* when pretrained is True otherwise *False*
+        use_attnpool (bool, optional): Whether or not to use the final AttentionPool2d
+            layer in the forward function. If set to True, model inputs are required
+            to have a shape of: [B, 3, 288, 288] or [3, 288, 288].
+            Default: False
 
     Returns:
         **CLIP_ResNet50x4Image** (CLIP_ResNet50x4Image): A CLIP ResNet 50x4 model's
@@ -60,6 +63,8 @@ def clip_resnet50x4_image(
             kwargs["replace_relus_with_redirectedrelu"] = True
         if "use_linear_modules_only" not in kwargs:
             kwargs["use_linear_modules_only"] = False
+        if "use_attnpool" not in kwargs:
+            kwargs["use_attnpool"] = False
 
     model = CLIP_ResNet50x4Image(**kwargs)
 
@@ -81,13 +86,14 @@ class CLIP_ResNet50x4Image(nn.Module):
     Visual Models From Natural Language Supervision': https://arxiv.org/abs/2103.00020
     """
 
-    __constants__ = ["transform_input"]
+    __constants__ = ["transform_input", "use_attnpool"]
 
     def __init__(
         self,
         transform_input: bool = False,
         replace_relus_with_redirectedrelu: bool = False,
         use_linear_modules_only: bool = False,
+        use_attnpool: bool = True,
     ) -> None:
         """
         Args:
@@ -101,6 +107,11 @@ def __init__(
             transform_input (bool, optional): If True, preprocesses the input according
                 to the method with which it was trained on.
                 Default: False
+            use_attnpool (bool, optional): Whether or not to use the final
+                AttentionPool2d layer in the forward function. If set to True, model
+                inputs are required to have a shape of: [B, 3, 288, 288] or
+                [3, 288, 288].
+                Default: True
         """
         super().__init__()
         if use_linear_modules_only:
@@ -112,6 +123,7 @@ def __init__(
             activ = nn.ReLU
 
         self.transform_input = transform_input
+        self.use_attnpool = use_attnpool
 
         # Stem layers
         self.conv1 = nn.Conv2d(3, 40, kernel_size=3, stride=2, padding=1, bias=False)
@@ -216,7 +228,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.layer4(x)
 
         # Attention Pooling
-        x = self.attnpool(x)
+        if self.use_attnpool:
+            x = self.attnpool(x)
         return x
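
For reference, here is a minimal usage sketch of the behavior after this commit (not part of the diff itself). The import path is an assumption based on the repository layout, and the commented shapes mirror the updated tests below: with the new default of use_attnpool=False the image trunk returns a spatial feature map and accepts other input sizes, while use_attnpool=True restores the previous 640-dimensional embedding and again requires 288x288 inputs.

# Minimal sketch of the new default; the import path is an assumption based on the
# repository layout, and the commented shapes mirror the updated tests.
import torch

from captum.optim.models import clip_resnet50x4_image

# New default: use_attnpool=False -> spatial features, no 288x288 restriction.
model = clip_resnet50x4_image(pretrained=False)
feats = model(torch.zeros(1, 3, 288, 288))
print(list(feats.shape))  # [1, 2560, 9, 9] per the tests

# Opting back in to attention pooling returns the pooled embedding, but only
# for inputs shaped [B, 3, 288, 288] or [3, 288, 288].
model_attn = clip_resnet50x4_image(pretrained=False, use_attnpool=True)
embedding = model_attn(torch.zeros(1, 3, 288, 288))
print(list(embedding.shape))  # [1, 640]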

tests/optim/models/test_clip_resnet50x4_image.py

Lines changed: 34 additions & 5 deletions
@@ -81,9 +81,10 @@ def test_clip_resnet50x4_image_load_and_forward(self) -> None:
                 + " insufficient Torch version."
             )
         x = torch.zeros(1, 3, 288, 288)
-        model = clip_resnet50x4_image(pretrained=True)
+        model = clip_resnet50x4_image(pretrained=True, use_attnpool=True)
         output = model(x)
         self.assertEqual(list(output.shape), [1, 640])
+        self.assertTrue(model.use_attnpool)
 
     def test_untrained_clip_resnet50x4_image_load_and_forward(self) -> None:
         if version.parse(torch.__version__) <= version.parse("1.6.0"):
@@ -92,9 +93,10 @@ def test_untrained_clip_resnet50x4_image_load_and_forward(self) -> None:
                 + " insufficient Torch version."
             )
         x = torch.zeros(1, 3, 288, 288)
-        model = clip_resnet50x4_image(pretrained=False)
+        model = clip_resnet50x4_image(pretrained=False, use_attnpool=True)
         output = model(x)
         self.assertEqual(list(output.shape), [1, 640])
+        self.assertTrue(model.use_attnpool)
 
     def test_clip_resnet50x4_image_warning(self) -> None:
         if version.parse(torch.__version__) <= version.parse("1.6.0"):
@@ -109,6 +111,30 @@ def test_clip_resnet50x4_image_warning(self) -> None:
         with self.assertWarns(UserWarning):
             _ = model._transform_input(x)
 
+    def test_clip_resnet50x4_image_use_attnpool_false(self) -> None:
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
+            raise unittest.SkipTest(
+                "Skipping basic pretrained CLIP ResNet 50x4 Image use_attnpool"
+                + " forward due to insufficient Torch version."
+            )
+        x = torch.zeros(1, 3, 288, 288)
+        model = clip_resnet50x4_image(pretrained=True, use_attnpool=False)
+        output = model(x)
+        self.assertEqual(list(output.shape), [1, 2560, 9, 9])
+        self.assertFalse(model.use_attnpool)
+
+    def test_clip_resnet50x4_image_use_attnpool_false_size_128(self) -> None:
+        if version.parse(torch.__version__) <= version.parse("1.6.0"):
+            raise unittest.SkipTest(
+                "Skipping basic pretrained CLIP ResNet 50x4 Image use_attnpool"
+                + " forward with 128x128 input due to insufficient Torch version."
+            )
+        x = torch.zeros(1, 3, 128, 128)
+        model = clip_resnet50x4_image(pretrained=True, use_attnpool=False)
+        output = model(x)
+        self.assertEqual(list(output.shape), [1, 2560, 4, 4])
+        self.assertFalse(model.use_attnpool)
+
     def test_clip_resnet50x4_image_forward_cuda(self) -> None:
         if version.parse(torch.__version__) <= version.parse("1.6.0"):
             raise unittest.SkipTest(
@@ -121,11 +147,12 @@ def test_clip_resnet50x4_image_forward_cuda(self) -> None:
                 + " not supporting CUDA."
             )
         x = torch.zeros(1, 3, 288, 288).cuda()
-        model = clip_resnet50x4_image(pretrained=True).cuda()
+        model = clip_resnet50x4_image(pretrained=True, use_attnpool=True).cuda()
         output = model(x)
 
         self.assertTrue(output.is_cuda)
         self.assertEqual(list(output.shape), [1, 640])
+        self.assertTrue(model.use_attnpool)
 
     def test_clip_resnet50x4_image_jit_module_no_redirected_relu(self) -> None:
         if version.parse(torch.__version__) <= version.parse("1.8.0"):
@@ -135,11 +162,12 @@ def test_clip_resnet50x4_image_jit_module_no_redirected_relu(self) -> None:
             )
         x = torch.zeros(1, 3, 288, 288)
         model = clip_resnet50x4_image(
-            pretrained=True, replace_relus_with_redirectedrelu=False
+            pretrained=True, replace_relus_with_redirectedrelu=False, use_attnpool=True
         )
         jit_model = torch.jit.script(model)
         output = jit_model(x)
         self.assertEqual(list(output.shape), [1, 640])
+        self.assertTrue(model.use_attnpool)
 
     def test_clip_resnet50x4_image_jit_module_with_redirected_relu(self) -> None:
         if version.parse(torch.__version__) <= version.parse("1.8.0"):
@@ -149,8 +177,9 @@ def test_clip_resnet50x4_image_jit_module_with_redirected_relu(self) -> None:
             )
         x = torch.zeros(1, 3, 288, 288)
         model = clip_resnet50x4_image(
-            pretrained=True, replace_relus_with_redirectedrelu=True
+            pretrained=True, replace_relus_with_redirectedrelu=True, use_attnpool=True
        )
         jit_model = torch.jit.script(model)
         output = jit_model(x)
         self.assertEqual(list(output.shape), [1, 640])
+        self.assertTrue(model.use_attnpool)
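
As an aside on why the restriction exists (my reading of the OpenAI CLIP RN50x4 architecture, not something this commit states): the trunk downsamples its input by an overall factor of 32, and AttentionPool2d learns a positional embedding sized for the resulting 9x9 grid plus one pooled token, so only 288x288 inputs line up with that embedding. A small sketch of the arithmetic, cross-checked against the shapes asserted in the new tests:

# Sketch only: the overall downsample factor of 32 is an assumption about the
# CLIP RN50x4 backbone, cross-checked against the spatial shapes in the new tests.
from typing import Tuple


def attnpool_grid(height: int, width: int, downsample: int = 32) -> Tuple[int, int]:
    """Spatial grid that reaches the attention pooling layer."""
    return (height // downsample, width // downsample)


print(attnpool_grid(288, 288))  # (9, 9)  -> matches [1, 2560, 9, 9]
print(attnpool_grid(128, 128))  # (4, 4)  -> matches [1, 2560, 4, 4]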
