[WIP] Support FSDP #358

Draft: wants to merge 37 commits into base: main

Changes from all commits
Commits (37, all by mehdidc):

- 67a0e13  resolve conflicts (Jan 31, 2023)
- 173cba4  show before fsdp memory usage (Jan 6, 2023)
- a45acae  add ddp again (Jan 6, 2023)
- fa80396  resolve conflicts (Jan 31, 2023)
- 9f967b7  resolve conflicts (Jan 31, 2023)
- 1832e13  resolve conflicts (Jan 31, 2023)
- 9d5369e  minor (Jan 7, 2023)
- 08016d0  fix logit scale and eval issues on FSDP (Jan 7, 2023)
- 8820831  support cpu offload (Jan 7, 2023)
- 188bc9c  wrap residual blocks with FSDP (Jan 7, 2023)
- 2782ab1  add forward trick to CustomCLIP (Jan 8, 2023)
- afd8ef3  test_training_clip_with_jit test error (Jan 31, 2023)
- 6627268  select layers to wrap in FSDP and grad checkpointing (Jan 31, 2023)
- fd42631  support unlocking (Feb 4, 2023)
- 4f65c85  fix hang after epoch finish (Feb 18, 2023)
- 3bada34  use `use_orig_params=True` (thanks to @nkflash) to use original param… (Feb 19, 2023)
- f495986  fix distill (Mar 7, 2023)
- 397b8fc  fix FSDP optim state save/load so that we save the full optim state d… (Mar 13, 2023)
- f2c72f8  offload to cpu when saving checkpoint to avoid OOM (Mar 14, 2023)
- a69c0a7  - use the new ModuleWrapPolicy instead of transformer_auto_wrap_polic… (May 17, 2023)
- 62980cb  use ShardedGradScaler for fsdp, thanks to @nkflash (May 17, 2023)
- 9e47140  - FSDP printouts: use logging info. (May 17, 2023)
- a8d644b  parametrize FSDP mixed precision (May 17, 2023)
- 16013c4  use a boolean param args.fsdp to match current args.horovod instead o… (May 17, 2023)
- 7735cac  replace last args.distributed_engine mention in the code (May 17, 2023)
- f4165f7  fsdp log on rank zero only (May 17, 2023)
- 3aa42f4  minor (May 17, 2023)
- 5e167b2  minor (May 17, 2023)
- 5704ada  rank0 only and offload to cpu both true as recommended (May 18, 2023)
- ffcf226  cli parameters description (May 18, 2023)
- d3ab217  support CoCa models (May 22, 2023)
- 86799c2  fix optimizer resuming in FSDP and remove param/buffer precision (May 24, 2023)
- 0859c84  use original_model instead of model (Nov 3, 2023)
- 0a98da2  delete old import (Nov 3, 2023)
- acd5af7  remove old zero shot classifier builder (Nov 3, 2023)
- 67bfcaa  fix again zero-shot eval (Nov 3, 2023)
- 4206d56  support sharded checkpointing for FSDP to handle large models, following (Nov 4, 2023)
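
Note (not part of the PR): the commit messages above name most of the moving parts: per-block auto-wrapping with ModuleWrapPolicy, parametrized mixed precision, optional CPU offload, use_orig_params=True, and ShardedGradScaler. Below is a minimal sketch of how these fit together with the stock PyTorch 2.x FSDP API, assuming open_clip's ResidualAttentionBlock is the wrap unit (per the "wrap residual blocks with FSDP" commit); the model name and dtypes are illustrative assumptions, not the PR's training code.

# Sketch only: stock PyTorch FSDP wiring for an open_clip model.
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision, CPUOffload
from torch.distributed.fsdp.wrap import ModuleWrapPolicy
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
import open_clip
from open_clip.transformer import ResidualAttentionBlock

dist.init_process_group(backend="nccl")  # assumes torchrun has set RANK/WORLD_SIZE/MASTER_ADDR
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model, _, _ = open_clip.create_model_and_transforms("ViT-B-32")
model = model.cuda()

model = FSDP(
    model,
    auto_wrap_policy=ModuleWrapPolicy({ResidualAttentionBlock}),  # shard per transformer block
    mixed_precision=MixedPrecision(param_dtype=torch.bfloat16,    # "parametrize FSDP mixed precision"
                                   reduce_dtype=torch.bfloat16),
    cpu_offload=CPUOffload(offload_params=False),                 # flip to True for "support cpu offload"
    use_orig_params=True,                                         # per the use_orig_params=True commit
    device_id=torch.cuda.current_device(),
)
scaler = ShardedGradScaler()  # replaces torch.cuda.amp.GradScaler once gradients are sharded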
34 changes: 23 additions & 11 deletions src/open_clip/coca_model.py
@@ -154,25 +154,37 @@ def encode_text(self, text, normalize: bool = True):
         text_latent, _ = self._encode_text(text, normalize=normalize)
         return text_latent
 
-    def forward(
-            self,
-            image,
-            text: Optional[torch.Tensor] = None,
-            image_latent: Optional[torch.Tensor] = None,
-            image_embs: Optional[torch.Tensor] = None,
-    ):
-        if image_latent is None or image_embs is None:
-            image_latent, image_embs = self._encode_image(image)
-
-        if text is None:
-            return {"image_features": image_latent, "image_embs": image_embs}
-
-        text_latent, token_embs = self._encode_text(text)
-
-        # TODO: add assertion to avoid bugs?
-        labels = text[:, -token_embs.shape[1]:]
-
-        logits = self.text_decoder(image_embs, token_embs)
+    def forward(self, image=None, text=None, embed_cls=True, image_latent=None, image_embs=None, clamp_logit_scale_to=0):
+        if text is not None:
+            text_latent, token_embs = self._encode_text(text, embed_cls=embed_cls)
+        else:
+            text_latent, token_embs = None, None
+
+        if image is not None:
+            if image_latent is None or image_embs is None:
+                image_latent, image_embs = self._encode_image(image)
+        else:
+            image_latent = None
+            image_embs = None
+
+        if text is not None and token_embs is not None:
+            labels = text[:, -token_embs.shape[1]:]
+            logits = self.text_decoder(image_embs, token_embs)
+        else:
+            labels = None
+            logits = None
+
+        if clamp_logit_scale_to:
+            with torch.no_grad():
+                self.logit_scale.data.clamp_(0, clamp_logit_scale_to)
         out_dict = {
             "image_features": image_latent,
             "text_features": text_latent,
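The reworked CoCa forward above makes image and text keyword arguments, computes the caption logits and labels only when text embeddings are available, and applies the logit-scale clamp inside the module. A hypothetical call with both modalities (variable names are assumptions, not from the PR):

import math

# Hypothetical CoCa training-step call against the reworked forward() above: both
# modalities passed as keywords, and the logit-scale cap applied inside the module
# rather than by reaching through the FSDP wrapper afterwards.
out = model(image=images, text=token_ids, embed_cls=True, clamp_logit_scale_to=math.log(100))
# out should include image_features, text_features, logits, labels, logit_scale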
15 changes: 8 additions & 7 deletions src/open_clip/model.py
@@ -285,14 +285,12 @@ def encode_text(self, text, normalize: bool = False):
 
         return F.normalize(x, dim=-1) if normalize else x
 
-    def forward(
-            self,
-            image: Optional[torch.Tensor] = None,
-            text: Optional[torch.Tensor] = None,
-    ):
+    def forward(self, image=None, text=None, clamp_logit_scale_to:float=0):
         image_features = self.encode_image(image, normalize=True) if image is not None else None
         text_features = self.encode_text(text, normalize=True) if text is not None else None
-
+        if clamp_logit_scale_to:
+            with torch.no_grad():
+                self.logit_scale.data.clamp_(0, clamp_logit_scale_to)
         if self.output_dict:
             out_dict = {
                 "image_features": image_features,
@@ -358,10 +356,13 @@ def forward(
             self,
             image: Optional[torch.Tensor] = None,
             text: Optional[torch.Tensor] = None,
+            clamp_logit_scale_to: float = 0,
     ):
         image_features = self.encode_image(image, normalize=True) if image is not None else None
         text_features = self.encode_text(text, normalize=True) if text is not None else None
-
+        if clamp_logit_scale_to:
+            with torch.no_grad():
+                self.logit_scale.data.clamp_(0, clamp_logit_scale_to)
         if self.output_dict:
             out_dict = {
                 "image_features": image_features,
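Both CLIP forward variants above gain the same clamp_logit_scale_to argument. Presumably this exists because, under FSDP, reaching into the wrapped module after optimizer.step() to clamp logit_scale touches a flattened or sharded parameter, whereas clamping inside forward keeps the operation on the gathered weights. A hypothetical training-step fragment (names assumed, loosely following open_clip's training loop):

import math

# Hypothetical training-step fragment (not from the PR): the cap is passed through
# forward() so the clamp runs inside the wrapped module, instead of the usual
# post-step model.logit_scale.clamp_(0, math.log(100)) applied from outside.
model_out = model(images, texts, clamp_logit_scale_to=math.log(100))
losses = loss(**model_out, output_dict=True)  # as in open_clip's ClipLoss-based loop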
3 changes: 2 additions & 1 deletion src/open_clip/zero_shot_classifier.py
@@ -53,7 +53,8 @@ def _process_batch(batch_classnames):
         num_batch_classes = len(batch_classnames)
         texts = [template.format(c) if use_format else template(c) for c in batch_classnames for template in templates]
         texts = tokenizer(texts).to(device)
-        class_embeddings = model.encode_text(texts, normalize=True)
+        output = model(text=texts)
+        class_embeddings = output['text_features'] if isinstance(output, dict) else output[1]
         class_embeddings = class_embeddings.reshape(num_batch_classes, num_templates, -1).mean(dim=1)
         class_embeddings = class_embeddings / class_embeddings.norm(dim=1, keepdim=True)
         class_embeddings = class_embeddings.T
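The zero-shot classifier builder now goes through the model's forward() instead of encode_text(), presumably because FSDP's parameter all-gather hooks forward, so a direct encode_text() call on the wrapped model would not see the full weights. A short sketch of the same calling pattern (assumes model and texts as in the diff):

import torch

# Sketch of the calling pattern above: route text through forward() so an FSDP-wrapped
# model triggers its all-gather, then handle both the dict output (output_dict=True)
# and the legacy (image_features, text_features, logit_scale) tuple.
with torch.no_grad():
    out = model(text=texts)  # instead of model.encode_text(texts, normalize=True)
    text_features = out["text_features"] if isinstance(out, dict) else out[1]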