Skip to content

Commit

Permalink
Add model_type to the model registry
Browse files Browse the repository at this point in the history
  • Loading branch information
kushal-10 committed Apr 28, 2024
1 parent ea68246 commit d9002ea
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 11 deletions.
24 changes: 13 additions & 11 deletions backends/huggingface_multimodal_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,15 @@
import backends
from PIL import Image
import requests
from transformers import AutoProcessor, AutoModelForVision2Seq, IdeficsForVisionText2Text
from transformers import AutoProcessor, AutoModelForVision2Seq, IdeficsForVisionText2Text, AutoModelForCausalLM
from jinja2 import Template

# Maps the registry's 'model_type' string to the transformers model class
# used to instantiate the multimodal model from pretrained weights.
MODEL_TYPE_MAP = dict(
    Idefics=IdeficsForVisionText2Text,
    Vision2Seq=AutoModelForVision2Seq,
)

logger = backends.get_logger(__name__)

def load_processor(model_spec: backends.ModelSpec) -> AutoProcessor:
Expand Down Expand Up @@ -37,12 +43,12 @@ def load_model(model_spec: backends.ModelSpec) -> AutoModelForVision2Seq:
logger.info(f'Start loading huggingface model weights: {model_spec.model_name}')
hf_model_str = model_spec['huggingface_id'] # Get the model name

if model_spec['model_name'] != 'idefics-80b-instruct':
model = AutoModelForVision2Seq.from_pretrained(hf_model_str, device_map="auto", torch_dtype="auto")
else:
model = IdeficsForVisionText2Text.from_pretrained(hf_model_str, device_map="auto", torch_dtype=torch.bfloat16)
model_type = MODEL_TYPE_MAP[model_spec['model_type']] # Use the appropriate Auto class to load the model

model = model_type.from_pretrained(hf_model_str, device_map="auto", torch_dtype="auto") # Load the model

logger.info(f"Finished loading huggingface model: {model_spec.model_name}")
logger.info(f"Device Map: {model.hf_device_map}")

return model

Expand Down Expand Up @@ -139,6 +145,7 @@ def __init__(self, model_spec: backends.ModelSpec):
self.template = model_spec["custom_chat_template"]
self.assistant_tag = model_spec["assistant"]
self.image_placeholder = model_spec["placeholder"]

self.padding = False
self.IDEFICS = False
if model_spec['model_name'] == 'idefics-80b-instruct':
Expand Down Expand Up @@ -166,9 +173,6 @@ def generate_response(self, messages: List[Dict],
template = Template(template_str)
prompt_text = template.render(messages=messages)

print("### PROMPT TEXT ###")
print(prompt_text)

# Get a list of images that will be passed to the Processor
images = get_images(prompt_text, messages, self.image_placeholder)
if self.padding:
Expand Down Expand Up @@ -204,9 +208,7 @@ def generate_response(self, messages: List[Dict],

# Store generated text
response = {'response': generated_text}
print("### GENERATED RESPONSE ###")
print(response)

response_text = generated_text[0].split(self.assistant_tag)[-1] # Get the last assistant response

return prompt, response, response_text
return prompt, response, response_text
8 changes: 8 additions & 0 deletions backends/model_registry.json
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@
"model_name": "llava-1.5-7b-hf",
"backend": "huggingface_multimodal",
"huggingface_id": "llava-hf/llava-1.5-7b-hf",
"model_type": "Vision2Seq",
"placeholder": "<image>",
"assistant": "ASSISTANT",
"padding": false,
Expand All @@ -384,6 +385,7 @@
"model_name": "llava-1.5-13b-hf",
"backend": "huggingface_multimodal",
"huggingface_id": "llava-hf/llava-1.5-13b-hf",
"model_type": "Vision2Seq",
"placeholder": "<image>",
"assistant": "ASSISTANT",
"padding": false,
Expand All @@ -394,6 +396,7 @@
"model_name": "vip-llava-7b-hf",
"backend": "huggingface_multimodal",
"huggingface_id": "llava-hf/vip-llava-7b-hf",
"model_type": "Vision2Seq",
"placeholder": "<image>",
"assistant": "ASSISTANT",
"padding": false,
Expand All @@ -404,6 +407,7 @@
"model_name": "llava-v1.6-34b-hf",
"backend": "huggingface_multimodal",
"huggingface_id": "llava-hf/llava-v1.6-34b-hf",
"model_type": "Vision2Seq",
"placeholder": "<image>",
"assistant": "assistant",
"padding": true,
Expand All @@ -414,6 +418,7 @@
"model_name": "llava-v1.6-mistral-7b-hf",
"backend": "huggingface_multimodal",
"huggingface_id": "llava-hf/llava-v1.6-mistral-7b-hf",
"model_type": "Vision2Seq",
"placeholder": "<image>",
"assistant": "[/INST]",
"padding": true,
Expand All @@ -424,6 +429,7 @@
"model_name": "llava-v1.6-vicuna-13b-hf",
"backend": "huggingface_multimodal",
"huggingface_id": "llava-hf/llava-v1.6-vicuna-13b-hf",
"model_type": "Vision2Seq",
"placeholder": "<image>",
"assistant": "ASSISTANT",
"padding": true,
Expand All @@ -434,6 +440,7 @@
"model_name": "llava-v1.6-vicuna-7b-hf",
"backend": "huggingface_multimodal",
"huggingface_id": "llava-hf/llava-v1.6-vicuna-7b-hf",
"model_type": "Vision2Seq",
"placeholder": "<image>",
"assistant": "ASSISTANT",
"padding": true,
Expand All @@ -444,6 +451,7 @@
"model_name": "idefics-80b-instruct",
"backend": "huggingface_multimodal",
"huggingface_id": "HuggingFaceM4/idefics-80b-instruct",
"model_type": "Idefics",
"placeholder": "",
"assistant": "ASSISTANT:",
"padding": false,
Expand Down

0 comments on commit d9002ea

Please sign in to comment.