diff --git a/tools/accuracy_checker/openvino/tools/accuracy_checker/annotation_converters/imagenet.py b/tools/accuracy_checker/openvino/tools/accuracy_checker/annotation_converters/imagenet.py
index 529afd337b6..0cd73a08140 100644
--- a/tools/accuracy_checker/openvino/tools/accuracy_checker/annotation_converters/imagenet.py
+++ b/tools/accuracy_checker/openvino/tools/accuracy_checker/annotation_converters/imagenet.py
@@ -16,16 +16,18 @@
 
 from pathlib import Path
 
-from ..config import PathField, BoolField
+from ..config import PathField, BoolField, StringField
 from ..representation import ClassificationAnnotation
 from ..utils import read_txt, get_path, check_file_existence, read_json
 
 from .format_converter import BaseFormatConverter, ConverterReturn, verify_label_map
+from ._nlp_common import get_tokenizer
 
 
 class ImageNetFormatConverter(BaseFormatConverter):
     __provider__ = 'imagenet'
     annotation_types = (ClassificationAnnotation, )
+    max_seq_length = 128
 
     @classmethod
     def parameters(cls):
@@ -47,6 +49,15 @@ def parameters(cls):
             ),
             'dataset_meta_file': PathField(
                 description='path to json file with dataset meta (e.g. label_map, color_encoding)', optional=True
+            ),
+            'prepare_input_ids_from_labels': BoolField(
+                optional=True, default=False,
+                description="Convert label strings into image captions list. It's required for CLIP models"
+            ),
+            'lower_case': BoolField(optional=True, default=False, description='Switch tokens to lower case register'),
+            'model_id': StringField(
+                optional=True,
+                description='The model id of a predefined tokenizer hosted inside a model repo on huggingface.co'
             )
         })
         return configuration_parameters
@@ -57,6 +68,10 @@ def configure(self):
         self.has_background = self.get_value_from_config('has_background')
         self.images_dir = self.get_value_from_config('images_dir') or self.annotation_file.parent
         self.dataset_meta = self.get_value_from_config('dataset_meta_file')
+        self.prepare_input_ids_from_labels = self.get_value_from_config('prepare_input_ids_from_labels')
+        self.lower_case = self.get_value_from_config('lower_case')
+        self.model_id = self.get_value_from_config('model_id')
+        self.tokenizer, self.external_tok = get_tokenizer(self.config, self.lower_case)
 
     def convert(self, check_content=False, progress_callback=None, progress_interval=100, **kwargs):
         annotation = []
@@ -78,7 +93,23 @@ def convert(self, check_content=False, progress_callback=None, progress_interval
         return ConverterReturn(annotation, self.get_meta(), content_errors)
 
     @staticmethod
-    def _create_meta(labels_file, dataset_meta, has_background=False):
+    def _create_captions(label_map, tokenizer):
+        tokenized_captions = []
+        input_masks = []
+        for label in label_map.values():
+            first_label = label.split(',')[0]
+            caption = f"This is a picture of {first_label}."
+            tokens = tokenizer.tokenize(caption, add_special_tokens=True)
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+            if len(tokens) > ImageNetFormatConverter.max_seq_length:
+                tokens = tokens[:ImageNetFormatConverter.max_seq_length]
+            input_mask = [1] * len(input_ids)
+            tokenized_captions.append(input_ids)
+            input_masks.append(input_mask)
+        return tokenized_captions, input_masks
+
+    @staticmethod
+    def _create_meta(labels_file, dataset_meta, tokenizer, has_background=False, prepare_input_ids_from_labels=False, ):
         meta = {}
         label_map = {}
         if dataset_meta:
@@ -106,8 +137,12 @@ def _create_meta(labels_file, dataset_meta, has_background=False):
             label_map[0] = 'background'
             meta['background_label'] = 0
 
+        if prepare_input_ids_from_labels:
+            (captions, masks) = ImageNetFormatConverter._create_captions(label_map, tokenizer)
+            meta['input_ids'] = captions
+            meta['input_masks'] = masks
         return meta
 
     def get_meta(self):
-        meta = self._create_meta(self.labels_file, self.dataset_meta, self.has_background) or None
+        meta = self._create_meta(self.labels_file, self.dataset_meta, self.tokenizer, self.has_background, self.prepare_input_ids_from_labels ) or None
         return meta