From 3304893796670d63e728f5000899b3286df91327 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 12 Nov 2024 01:49:21 -0500 Subject: [PATCH 01/12] lmms Signed-off-by: n1ck-guo --- auto_round/__main__.py | 10 +++++++++- auto_round/script/mllm.py | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index ec093ff2..3968b617 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -40,8 +40,16 @@ def run_mllm(): else: tune(args) +def run_lmms(): + from auto_round.script.lmms_eval import setup_lmms_args, eval + args = setup_lmms_args() + eval(args) + def switch(): - if "--mllm" in sys.argv: + if "--lmms" in sys.argv: + sys.argv.remove("--lmms") + run_lmms() + elif "--mllm" in sys.argv: sys.argv.remove("--mllm") run_mllm() else: diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index ca5d0351..b1554eb5 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -383,3 +383,5 @@ def eval(args): mode=args.mode, ignore=args.ignore ) + + From 79d07f079d1c0b87e99cb427dd05babdaca51f3c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Nov 2024 06:50:41 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/script/lmms_eval.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/auto_round/script/lmms_eval.py b/auto_round/script/lmms_eval.py index 296dcea8..2a14b685 100644 --- a/auto_round/script/lmms_eval.py +++ b/auto_round/script/lmms_eval.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import argparse import datetime import importlib From 145f16580a511acfe650b61c8e96fc32788de34e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 12 Nov 2024 01:50:43 -0500 Subject: [PATCH 03/12] add file Signed-off-by: n1ck-guo --- auto_round/script/lmms_eval.py | 511 +++++++++++++++++++++++++++++++++ 1 file changed, 511 insertions(+) create mode 100644 auto_round/script/lmms_eval.py diff --git a/auto_round/script/lmms_eval.py b/auto_round/script/lmms_eval.py new file mode 100644 index 00000000..296dcea8 --- /dev/null +++ b/auto_round/script/lmms_eval.py @@ -0,0 +1,511 @@ +import argparse +import datetime +import importlib +import json +import os +import sys +import traceback +from typing import Union +from functools import partial +import logging +import numpy as np +import yaml + +from accelerate import Accelerator +from accelerate.utils import InitProcessGroupKwargs + +from loguru import logger as eval_logger +from lmms_eval import evaluator, utils +from lmms_eval.api.registry import ALL_TASKS +from lmms_eval.evaluator import request_caching_arg_to_dict +from lmms_eval.loggers import EvaluationTracker, WandbLogger +from lmms_eval.tasks import TaskManager +from lmms_eval.utils import ( + make_table, + simple_parse_args_string, +) + +def _int_or_none_list_arg_type(min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","): + def parse_value(item): + item = item.strip().lower() + if item == "none": + return None + try: + return int(item) + except ValueError: + raise argparse.ArgumentTypeError(f"{item} is not an integer or None") + + items = [parse_value(v) for v in value.split(split_char)] + num_items = len(items) + + if num_items == 1: + # Makes downstream handling the same for single and multiple values + items = items * max_len + elif num_items < min_len or num_items > max_len: + raise argparse.ArgumentTypeError(f"Argument requires {max_len} integers or None, separated by '{split_char}'") + elif num_items != max_len: + logging.warning(f"Argument requires {max_len} integers or None, separated by '{split_char}'. " "Missing values will be filled with defaults.") + default_items = [parse_value(v) for v in defaults.split(split_char)] + items.extend(default_items[num_items:]) # extend items list with missing defaults + + return items + + +def setup_lmms_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("--config", default="", help="Path to a yaml file specifying all eval arguments, will ignore cli arguments if specified") + parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`, llava,qwen2_vl,llama_vision,phi3v,cogvlm2") + parser.add_argument( + "--tasks", + default="pope,textvqa_val,scienceqa,mmbench_en", + help="To get full list of tasks, use the command lmms-eval --tasks list", + ) + parser.add_argument( + "--model_args", + default="", + help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`", + ) + parser.add_argument( + "--num_fewshot", + type=int, + default=None, + help="Number of examples in few-shot context", + ) + parser.add_argument( + "--batch_size", + "-b", + type=str, + default=1, + metavar="auto|auto:N|N", + help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.", + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=None, + metavar="N", + help="Maximal batch size to try with --batch_size auto.", + ) + parser.add_argument( + "--device", + type=str, + default=None, + help="Device to use (e.g. 
cuda, cuda:0, cpu)", + ) + parser.add_argument( + "--output_path", + default=None, + type=str, + metavar="= [dir/file.jsonl] [DIR]", + help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", + ) + parser.add_argument( + "--limit", + type=float, + default=None, + help="Limit the number of examples per task. " "If <1, limit is a percentage of the total number of examples.", + ) + parser.add_argument( + "--use_cache", + "-c", + type=str, + default=None, + metavar="DIR", + help="A path to a sqlite db file for caching model responses. `None` if not caching.", + ) + parser.add_argument( + "--cache_requests", + type=str, + default=None, + choices=["true", "refresh", "delete"], + help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.", + ) + parser.add_argument( + "--check_integrity", + action="store_true", + help="Whether to run the relevant part of the test suite for the tasks", + ) + parser.add_argument( + "--write_out", + "-w", + action="store_true", + default=False, + help="Prints the prompt for the first few documents.", + ) + parser.add_argument( + "--log_samples", + action="store_true", + default=False, + help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis", + ) + parser.add_argument( + "--wandb_log_samples", + action="store_true", + default=False, + help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis to Weights and Biases", + ) + parser.add_argument( + "--log_samples_suffix", + type=str, + default="model_outputs", + help="Specify a suffix for the log_samples file name.", + ) + parser.add_argument( + "--system_instruction", + type=str, + default=None, + help="System instruction to be used in the prompt", + ) + parser.add_argument( + "--apply_chat_template", + action="store_true", + default=False, + help="If True, applies the chat template to the prompt", + ) + parser.add_argument( + "--fewshot_as_multiturn", + action="store_true", + default=False, + help="If True, uses the fewshot as a multi-turn conversation", + ) + parser.add_argument( + "--show_config", + action="store_true", + default=False, + help="If True, shows the the full config of all tasks at the end of the evaluation.", + ) + parser.add_argument( + "--include_path", + type=str, + default=None, + help="Additional path to include if there are external tasks to include.", + ) + parser.add_argument( + "--gen_kwargs", + default="", + help=("String arguments for model generation on greedy_until tasks," " e.g. `temperature=0,top_k=0,top_p=0`"), + ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Log error when tasks are not registered.", + ) + parser.add_argument( + "--wandb_args", + default="", + help="Comma separated string arguments passed to wandb.init, e.g. `project=lmms-eval,job_type=eval", + ) + parser.add_argument( + "--timezone", + default="Asia/Singapore", + help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles. You can check the full list via `import pytz; print(pytz.common_timezones)`", + ) + parser.add_argument( + "--hf_hub_log_args", + type=str, + default="", + help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. 
`hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`", + ) + parser.add_argument( + "--predict_only", + "-x", + action="store_true", + default=False, + help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.", + ) + default_seed_string = "0,1234,1234,1234" + parser.add_argument( + "--seed", + type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string), + default=default_seed_string, # for backward compatibility + help=( + "Set seed for python's random, numpy, torch, and fewshot sampling.\n" + "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, " + "respectively, or a single integer to set the same seed for all four.\n" + f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` " + "(for backward compatibility).\n" + "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. " + "Here numpy's seed is not set since the second value is `None`.\n" + "E.g, `--seed 42` sets all four seeds to 42." + ), + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub", + ) + args = parser.parse_args() + return args + +def _handle_non_serializable(o): + if isinstance(o, np.int64) or isinstance(o, np.int32): + return int(o) + elif isinstance(o, set): + return list(o) + else: + return str(o) + +def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: + # Check if no arguments were passed after parsing + if len(sys.argv) == 1: + print("┌───────────────────────────────────────────────────────────────────────────────┐") + print("│ Please provide arguments to evaluate the model. e.g. │") + print("│ `lmms-eval --model llava --model_path liuhaotian/llava-v1.6-7b --tasks okvqa` │") + print("│ Use `lmms-eval --help` for more information. 
│") + print("└───────────────────────────────────────────────────────────────────────────────┘") + sys.exit(1) + + if args.wandb_args: + if "name" not in args.wandb_args: + name = f"{args.model}_{args.model_args}_{utils.get_datetime_str(timezone=args.timezone)}" + name = utils.sanitize_long_string(name) + args.wandb_args += f",name={name}" + wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args)) + + # reset logger + eval_logger.remove() + eval_logger.add(sys.stdout, colorize=True, level=args.verbosity) + eval_logger.info(f"Verbosity set to {args.verbosity}") + os.environ["VERBOSITY"] = args.verbosity + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + args_list = [] + results_list = [] + if args.config: + if not os.path.exists(args.config): + raise ValueError(f"Config file does not exist: {args.config}") + + with open(args.config, "r") as file: + config_args = yaml.safe_load(file) + config_args = [config_args] if type(config_args) != list else config_args + # multiple configs, create args list first + for config in config_args: + args_copy = argparse.Namespace(**vars(args)) + for key, value in config.items(): + setattr(args_copy, key, value) + args_list.append(args_copy) + else: + args_list.append(args) + + # initialize Accelerator + kwargs_handler = InitProcessGroupKwargs(timeout=datetime.timedelta(seconds=60000)) + accelerator = Accelerator(kwargs_handlers=[kwargs_handler]) + if accelerator.is_main_process: + is_main_process = True + else: + is_main_process = False + + for args in args_list: + try: + # if is_main_process and args.wandb_args: # thoughtfully we should only init wandb once, instead of multiple ranks to avoid network traffics and unwanted behaviors. + # wandb_logger = WandbLogger() + + results, samples = cli_evaluate_single(args) + results_list.append(results) + + accelerator.wait_for_everyone() + if is_main_process and args.wandb_args: + try: + wandb_logger.post_init(results) + wandb_logger.log_eval_result() + if args.wandb_log_samples and samples is not None: + wandb_logger.log_eval_samples(samples) + except Exception as e: + eval_logger.info(f"Logging to Weights and Biases failed due to {e}") + # wandb_logger.finish() + + except Exception as e: + if args.verbosity == "DEBUG": + raise e + else: + traceback.print_exc() + eval_logger.error(f"Error during evaluation: {e}. 
Please set `--verbosity=DEBUG` to get more information.") + results_list.append(None) + + for args, results in zip(args_list, results_list): + # cli_evaluate will return none if the process is not the main process (rank 0) + if results is not None: + print(f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}") + print(make_table(results)) + if "groups" in results: + print(make_table(results, "groups")) + + if args.wandb_args: + wandb_logger.run.finish() + + +def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None: + selected_task_list = args.tasks.split(",") if args.tasks else None + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + task_manager = TaskManager(args.verbosity, include_path=args.include_path, model_name=args.model) + + # update the evaluation tracker args with the output path and the HF token + if args.output_path: + args.hf_hub_log_args += f",output_path={args.output_path}" + if os.environ.get("HF_TOKEN", None): + args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}" + + evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args) + eval_logger.info(f"Evaluation tracker args: {evaluation_tracker_args}") + + evaluation_tracker = EvaluationTracker(**evaluation_tracker_args) + + if args.predict_only: + args.log_samples = True + if (args.log_samples or args.predict_only) and not args.output_path: + raise ValueError("Specify --output_path if providing --log_samples or --predict_only") + + if args.fewshot_as_multiturn and args.apply_chat_template is False: + raise ValueError("If fewshot_as_multiturn is set, apply_chat_template must be set to True.") + + if (args.num_fewshot is None or args.num_fewshot == 0) and args.fewshot_as_multiturn: + raise ValueError("If fewshot_as_multiturn is set, num_fewshot must be greater than 0.") + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + + if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples: + eval_logger.warning("Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub.") + + if args.limit: + eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.") + + if os.environ.get("LMMS_EVAL_PLUGINS", None): + args.include_path = [args.include_path] if args.include_path else [] + for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","): + package_tasks_location = importlib.util.find_spec(f"{plugin}.tasks").submodule_search_locations[0] + args.include_path.append(package_tasks_location) + + if args.tasks is None: + eval_logger.error("Need to specify task to evaluate.") + sys.exit() + elif args.tasks == "list": + eval_logger.info("Available Tasks:\n - {}".format(task_manager.list_all_tasks())) + sys.exit() + elif args.tasks == "list_groups": + eval_logger.info(task_manager.list_all_tasks(list_subtasks=False, list_tags=False)) + sys.exit() + elif args.tasks == "list_tags": + eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_subtasks=False)) + sys.exit() + elif args.tasks == "list_subtasks": + eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_tags=False)) + sys.exit() + elif args.tasks == "list_with_num": + log_message = ( + "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." 
+ "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70 + ) + eval_logger.info(log_message) + for task_name in sorted(task_manager.list_all_tasks()): + try: + task_dict = get_task_dict([task_name], model_name="llava") + task_obj = task_dict[task_name] + if type(task_obj) == tuple: + group, task_obj = task_obj + if task_obj is None: + continue + eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}") + except Exception as e: + eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}") + sys.exit() + else: + if os.path.isdir(args.tasks): + import glob + + task_names = [] + yaml_path = os.path.join(args.tasks, "*.yaml") + for yaml_file in glob.glob(yaml_path): + config = utils.load_yaml_config(yaml_file) + task_names.append(config) + else: + task_list = args.tasks.split(",") + task_names = task_manager.match_tasks(task_list) + for task in [task for task in task_list if task not in task_names]: + if os.path.isfile(task): + config = utils.load_yaml_config(task) + task_names.append(config) + task_missing = [task for task in task_list if task not in task_names and "*" not in task] # we don't want errors if a wildcard ("*") task name was used + + if task_missing: + missing = ", ".join(task_missing) + eval_logger.error( + f"Tasks were not found: {missing}\n" f"{utils.SPACING}Try `lmms-eval --tasks list` for list of available tasks", + ) + raise ValueError( + f"Tasks not found: {missing}. Try `lmms-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues." 
+ ) + + eval_logger.info(f"Selected Tasks: {task_names}") + request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests) + datetime_str = utils.get_datetime_str(timezone=args.timezone) + + if args.model == "phi3v": + args.model_args = args.model_args.replace("pretrained", "model_id_name") + results = evaluator.simple_evaluate( + model=args.model, + model_args=args.model_args, + tasks=task_names, + num_fewshot=args.num_fewshot, + batch_size=args.batch_size, + max_batch_size=args.max_batch_size, + device=args.device, + use_cache=args.use_cache, + limit=args.limit, + check_integrity=args.check_integrity, + write_out=args.write_out, + log_samples=args.log_samples, + evaluation_tracker=evaluation_tracker, + system_instruction=args.system_instruction, + apply_chat_template=args.apply_chat_template, + fewshot_as_multiturn=args.fewshot_as_multiturn, + gen_kwargs=args.gen_kwargs, + task_manager=task_manager, + verbosity=args.verbosity, + predict_only=args.predict_only, + random_seed=args.seed[0], + numpy_random_seed=args.seed[1], + torch_random_seed=args.seed[2], + fewshot_random_seed=args.seed[3], + cli_args=args, + datetime_str=datetime_str, + **request_caching_args, + ) + + if results is not None: + if args.log_samples: + samples = results.pop("samples") + else: + samples = None + dumped = json.dumps(results, indent=4, default=_handle_non_serializable) + if args.show_config: + print(dumped) + + batch_sizes = ",".join(map(str, results["config"]["batch_sizes"])) + + evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None, datetime_str=datetime_str) + + if args.log_samples: + for task_name, config in results["configs"].items(): + evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name]) + + if evaluation_tracker.push_results_to_hub or evaluation_tracker.push_samples_to_hub: + evaluation_tracker.recreate_metadata_card() + + return results, samples + return None, None + +from lmms_eval import evaluator +from lmms_eval.utils import make_table + + +def eval(args): + try: + from auto_round import AutoRoundConfig + except: + from auto_round.auto_quantizer import AutoHfQuantizer + + cli_evaluate(args) + \ No newline at end of file From 47146677136c0c39fb4ce61c9baa44eda854dc93 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 13 Nov 2024 01:33:09 -0500 Subject: [PATCH 04/12] similar usage of lmms and vlmeval Signed-off-by: n1ck-guo --- auto_round/__main__.py | 12 +- auto_round/mllm/__init__.py | 2 +- auto_round/mllm/eval.py | 114 ++++++++ auto_round/script/lmms_eval.py | 511 --------------------------------- auto_round/script/mllm.py | 72 ++++- 5 files changed, 193 insertions(+), 518 deletions(-) delete mode 100644 auto_round/script/lmms_eval.py diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 3968b617..8815c8c0 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -41,9 +41,15 @@ def run_mllm(): tune(args) def run_lmms(): - from auto_round.script.lmms_eval import setup_lmms_args, eval - args = setup_lmms_args() - eval(args) + try: + import importlib + importlib.import_module("lmms_eval") + except: + raise ImportError("please install the lmms_eval firt.") + # from auto_round.script.lmms_eval import setup_lmms_args, eval + from auto_round.script.mllm import setup_lmms_parser, lmms_eval + args = setup_lmms_parser() + lmms_eval(args) def switch(): if "--lmms" in sys.argv: diff --git a/auto_round/mllm/__init__.py b/auto_round/mllm/__init__.py index f42a4f48..41858319 
100644 --- a/auto_round/mllm/__init__.py +++ b/auto_round/mllm/__init__.py @@ -16,4 +16,4 @@ from .template import Template, get_template, TEMPLATES from .autoround_mllm import AutoRoundMLLM from ..utils import LazyImport -from .eval import mllm_eval \ No newline at end of file +from .eval import mllm_eval, lmms_eval \ No newline at end of file diff --git a/auto_round/mllm/eval.py b/auto_round/mllm/eval.py index 58f8ba18..817174b8 100644 --- a/auto_round/mllm/eval.py +++ b/auto_round/mllm/eval.py @@ -26,6 +26,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Copyright (c) 2024 LMMs-Lab + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + import os import time import json @@ -33,6 +53,8 @@ import pandas as pd from ..utils import logger, LazyImport +import numpy as np + vlmeval = LazyImport("vlmeval") @@ -284,3 +306,95 @@ def mllm_eval( continue rt_file.write('%d tasks cost: %.4fs\n' % (len(dataset), time.time() - st)) rt_file.close() + + +MODEL_TYPE_TO_LMMS_MODEL = { + # model_name + "Qwen-VL": "qwen_vl", + "Qwen2-VL": "qwen2_vl", + "cogvlm2": "cogvlm2", + "llava_v1.5": "llava", + "Llama-3.2": "llama_vision", + "Phi-3-vision": "phi3v", + "Phi-3.5-vision": "phi3v", + + # model_type + "qwen2_vl": "qwen2_vl", + "qwen": "qwen_vl", + "llava": "llava", + "phi3_v": "phi3v", + "mllama": "llama_vision", +} + +_lmms_eval = LazyImport("lmms_eval") + +def _handle_non_serializable(o): + if isinstance(o, np.int64) or isinstance(o, np.int32): + return int(o) + elif isinstance(o, set): + return list(o) + else: + return str(o) + +def lmms_eval( + model, + tasks, + output_dir = None, + num_fewshot=None, + limit=None, + batch_size=1, + max_batch_size=None, + device='cpu', + use_cache=None, + apply_chat_template=False + ): + if isinstance(tasks, str): + tasks = tasks.replace(' ', '').split(',') + + model_name = model + if model_name[-1] == "/": + model_name = model_name[:-1] + model_name = model_name.split("/")[-1] + + model_type = None + split_name = model_name.split("-") + for i in range(len(split_name), 0, -1): + tmp = "-".join(split_name[0:i]) + if tmp in MODEL_TYPE_TO_LMMS_MODEL: + model_type = tmp + break + if model_type is None: + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model, trust_remote_code=True) + model_type = config.model_type + + assert model_type in MODEL_TYPE_TO_LMMS_MODEL, f"{model_type} is not support by lmms." 
+ + if MODEL_TYPE_TO_LMMS_MODEL[model_type] == "phi3v": + model_args = f"model_id_name={model}" + else: + model_args = f"pretrained={model}" + results = _lmms_eval.evaluator.simple_evaluate( + model=MODEL_TYPE_TO_LMMS_MODEL[model_type], + model_args=model_args, + tasks=tasks, + num_fewshot=num_fewshot, + limit=limit, + batch_size=batch_size, + max_batch_size=max_batch_size, + device=device, + use_cache=use_cache, + apply_chat_template=apply_chat_template, + ) + + # print and save result + print(_lmms_eval.utils.make_table(results)) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + from datetime import datetime + now = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = os.path.join(output_dir, f"{model_name}_{now}_result.json") + json.dump(results, open(output_file, 'w'), indent=4, default=_handle_non_serializable) + + return results diff --git a/auto_round/script/lmms_eval.py b/auto_round/script/lmms_eval.py deleted file mode 100644 index 296dcea8..00000000 --- a/auto_round/script/lmms_eval.py +++ /dev/null @@ -1,511 +0,0 @@ -import argparse -import datetime -import importlib -import json -import os -import sys -import traceback -from typing import Union -from functools import partial -import logging -import numpy as np -import yaml - -from accelerate import Accelerator -from accelerate.utils import InitProcessGroupKwargs - -from loguru import logger as eval_logger -from lmms_eval import evaluator, utils -from lmms_eval.api.registry import ALL_TASKS -from lmms_eval.evaluator import request_caching_arg_to_dict -from lmms_eval.loggers import EvaluationTracker, WandbLogger -from lmms_eval.tasks import TaskManager -from lmms_eval.utils import ( - make_table, - simple_parse_args_string, -) - -def _int_or_none_list_arg_type(min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","): - def parse_value(item): - item = item.strip().lower() - if item == "none": - return None - try: - return int(item) - except ValueError: - raise argparse.ArgumentTypeError(f"{item} is not an integer or None") - - items = [parse_value(v) for v in value.split(split_char)] - num_items = len(items) - - if num_items == 1: - # Makes downstream handling the same for single and multiple values - items = items * max_len - elif num_items < min_len or num_items > max_len: - raise argparse.ArgumentTypeError(f"Argument requires {max_len} integers or None, separated by '{split_char}'") - elif num_items != max_len: - logging.warning(f"Argument requires {max_len} integers or None, separated by '{split_char}'. " "Missing values will be filled with defaults.") - default_items = [parse_value(v) for v in defaults.split(split_char)] - items.extend(default_items[num_items:]) # extend items list with missing defaults - - return items - - -def setup_lmms_args(): - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("--config", default="", help="Path to a yaml file specifying all eval arguments, will ignore cli arguments if specified") - parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`, llava,qwen2_vl,llama_vision,phi3v,cogvlm2") - parser.add_argument( - "--tasks", - default="pope,textvqa_val,scienceqa,mmbench_en", - help="To get full list of tasks, use the command lmms-eval --tasks list", - ) - parser.add_argument( - "--model_args", - default="", - help="String arguments for model, e.g. 
`pretrained=EleutherAI/pythia-160m,dtype=float32`", - ) - parser.add_argument( - "--num_fewshot", - type=int, - default=None, - help="Number of examples in few-shot context", - ) - parser.add_argument( - "--batch_size", - "-b", - type=str, - default=1, - metavar="auto|auto:N|N", - help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.", - ) - parser.add_argument( - "--max_batch_size", - type=int, - default=None, - metavar="N", - help="Maximal batch size to try with --batch_size auto.", - ) - parser.add_argument( - "--device", - type=str, - default=None, - help="Device to use (e.g. cuda, cuda:0, cpu)", - ) - parser.add_argument( - "--output_path", - default=None, - type=str, - metavar="= [dir/file.jsonl] [DIR]", - help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", - ) - parser.add_argument( - "--limit", - type=float, - default=None, - help="Limit the number of examples per task. " "If <1, limit is a percentage of the total number of examples.", - ) - parser.add_argument( - "--use_cache", - "-c", - type=str, - default=None, - metavar="DIR", - help="A path to a sqlite db file for caching model responses. `None` if not caching.", - ) - parser.add_argument( - "--cache_requests", - type=str, - default=None, - choices=["true", "refresh", "delete"], - help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.", - ) - parser.add_argument( - "--check_integrity", - action="store_true", - help="Whether to run the relevant part of the test suite for the tasks", - ) - parser.add_argument( - "--write_out", - "-w", - action="store_true", - default=False, - help="Prints the prompt for the first few documents.", - ) - parser.add_argument( - "--log_samples", - action="store_true", - default=False, - help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis", - ) - parser.add_argument( - "--wandb_log_samples", - action="store_true", - default=False, - help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis to Weights and Biases", - ) - parser.add_argument( - "--log_samples_suffix", - type=str, - default="model_outputs", - help="Specify a suffix for the log_samples file name.", - ) - parser.add_argument( - "--system_instruction", - type=str, - default=None, - help="System instruction to be used in the prompt", - ) - parser.add_argument( - "--apply_chat_template", - action="store_true", - default=False, - help="If True, applies the chat template to the prompt", - ) - parser.add_argument( - "--fewshot_as_multiturn", - action="store_true", - default=False, - help="If True, uses the fewshot as a multi-turn conversation", - ) - parser.add_argument( - "--show_config", - action="store_true", - default=False, - help="If True, shows the the full config of all tasks at the end of the evaluation.", - ) - parser.add_argument( - "--include_path", - type=str, - default=None, - help="Additional path to include if there are external tasks to include.", - ) - parser.add_argument( - "--gen_kwargs", - default="", - help=("String arguments for model generation on greedy_until tasks," " e.g. 
`temperature=0,top_k=0,top_p=0`"), - ) - parser.add_argument( - "--verbosity", - type=str, - default="INFO", - help="Log error when tasks are not registered.", - ) - parser.add_argument( - "--wandb_args", - default="", - help="Comma separated string arguments passed to wandb.init, e.g. `project=lmms-eval,job_type=eval", - ) - parser.add_argument( - "--timezone", - default="Asia/Singapore", - help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles. You can check the full list via `import pytz; print(pytz.common_timezones)`", - ) - parser.add_argument( - "--hf_hub_log_args", - type=str, - default="", - help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`", - ) - parser.add_argument( - "--predict_only", - "-x", - action="store_true", - default=False, - help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.", - ) - default_seed_string = "0,1234,1234,1234" - parser.add_argument( - "--seed", - type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string), - default=default_seed_string, # for backward compatibility - help=( - "Set seed for python's random, numpy, torch, and fewshot sampling.\n" - "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, " - "respectively, or a single integer to set the same seed for all four.\n" - f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` " - "(for backward compatibility).\n" - "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. " - "Here numpy's seed is not set since the second value is `None`.\n" - "E.g, `--seed 42` sets all four seeds to 42." - ), - ) - parser.add_argument( - "--trust_remote_code", - action="store_true", - help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub", - ) - args = parser.parse_args() - return args - -def _handle_non_serializable(o): - if isinstance(o, np.int64) or isinstance(o, np.int32): - return int(o) - elif isinstance(o, set): - return list(o) - else: - return str(o) - -def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: - # Check if no arguments were passed after parsing - if len(sys.argv) == 1: - print("┌───────────────────────────────────────────────────────────────────────────────┐") - print("│ Please provide arguments to evaluate the model. e.g. │") - print("│ `lmms-eval --model llava --model_path liuhaotian/llava-v1.6-7b --tasks okvqa` │") - print("│ Use `lmms-eval --help` for more information. 
│") - print("└───────────────────────────────────────────────────────────────────────────────┘") - sys.exit(1) - - if args.wandb_args: - if "name" not in args.wandb_args: - name = f"{args.model}_{args.model_args}_{utils.get_datetime_str(timezone=args.timezone)}" - name = utils.sanitize_long_string(name) - args.wandb_args += f",name={name}" - wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args)) - - # reset logger - eval_logger.remove() - eval_logger.add(sys.stdout, colorize=True, level=args.verbosity) - eval_logger.info(f"Verbosity set to {args.verbosity}") - os.environ["VERBOSITY"] = args.verbosity - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - args_list = [] - results_list = [] - if args.config: - if not os.path.exists(args.config): - raise ValueError(f"Config file does not exist: {args.config}") - - with open(args.config, "r") as file: - config_args = yaml.safe_load(file) - config_args = [config_args] if type(config_args) != list else config_args - # multiple configs, create args list first - for config in config_args: - args_copy = argparse.Namespace(**vars(args)) - for key, value in config.items(): - setattr(args_copy, key, value) - args_list.append(args_copy) - else: - args_list.append(args) - - # initialize Accelerator - kwargs_handler = InitProcessGroupKwargs(timeout=datetime.timedelta(seconds=60000)) - accelerator = Accelerator(kwargs_handlers=[kwargs_handler]) - if accelerator.is_main_process: - is_main_process = True - else: - is_main_process = False - - for args in args_list: - try: - # if is_main_process and args.wandb_args: # thoughtfully we should only init wandb once, instead of multiple ranks to avoid network traffics and unwanted behaviors. - # wandb_logger = WandbLogger() - - results, samples = cli_evaluate_single(args) - results_list.append(results) - - accelerator.wait_for_everyone() - if is_main_process and args.wandb_args: - try: - wandb_logger.post_init(results) - wandb_logger.log_eval_result() - if args.wandb_log_samples and samples is not None: - wandb_logger.log_eval_samples(samples) - except Exception as e: - eval_logger.info(f"Logging to Weights and Biases failed due to {e}") - # wandb_logger.finish() - - except Exception as e: - if args.verbosity == "DEBUG": - raise e - else: - traceback.print_exc() - eval_logger.error(f"Error during evaluation: {e}. 
Please set `--verbosity=DEBUG` to get more information.") - results_list.append(None) - - for args, results in zip(args_list, results_list): - # cli_evaluate will return none if the process is not the main process (rank 0) - if results is not None: - print(f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}") - print(make_table(results)) - if "groups" in results: - print(make_table(results, "groups")) - - if args.wandb_args: - wandb_logger.run.finish() - - -def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None: - selected_task_list = args.tasks.split(",") if args.tasks else None - - if args.include_path is not None: - eval_logger.info(f"Including path: {args.include_path}") - task_manager = TaskManager(args.verbosity, include_path=args.include_path, model_name=args.model) - - # update the evaluation tracker args with the output path and the HF token - if args.output_path: - args.hf_hub_log_args += f",output_path={args.output_path}" - if os.environ.get("HF_TOKEN", None): - args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}" - - evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args) - eval_logger.info(f"Evaluation tracker args: {evaluation_tracker_args}") - - evaluation_tracker = EvaluationTracker(**evaluation_tracker_args) - - if args.predict_only: - args.log_samples = True - if (args.log_samples or args.predict_only) and not args.output_path: - raise ValueError("Specify --output_path if providing --log_samples or --predict_only") - - if args.fewshot_as_multiturn and args.apply_chat_template is False: - raise ValueError("If fewshot_as_multiturn is set, apply_chat_template must be set to True.") - - if (args.num_fewshot is None or args.num_fewshot == 0) and args.fewshot_as_multiturn: - raise ValueError("If fewshot_as_multiturn is set, num_fewshot must be greater than 0.") - - if args.include_path is not None: - eval_logger.info(f"Including path: {args.include_path}") - - if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples: - eval_logger.warning("Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub.") - - if args.limit: - eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.") - - if os.environ.get("LMMS_EVAL_PLUGINS", None): - args.include_path = [args.include_path] if args.include_path else [] - for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","): - package_tasks_location = importlib.util.find_spec(f"{plugin}.tasks").submodule_search_locations[0] - args.include_path.append(package_tasks_location) - - if args.tasks is None: - eval_logger.error("Need to specify task to evaluate.") - sys.exit() - elif args.tasks == "list": - eval_logger.info("Available Tasks:\n - {}".format(task_manager.list_all_tasks())) - sys.exit() - elif args.tasks == "list_groups": - eval_logger.info(task_manager.list_all_tasks(list_subtasks=False, list_tags=False)) - sys.exit() - elif args.tasks == "list_tags": - eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_subtasks=False)) - sys.exit() - elif args.tasks == "list_subtasks": - eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_tags=False)) - sys.exit() - elif args.tasks == "list_with_num": - log_message = ( - "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." 
+ "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70 - ) - eval_logger.info(log_message) - for task_name in sorted(task_manager.list_all_tasks()): - try: - task_dict = get_task_dict([task_name], model_name="llava") - task_obj = task_dict[task_name] - if type(task_obj) == tuple: - group, task_obj = task_obj - if task_obj is None: - continue - eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}") - except Exception as e: - eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}") - sys.exit() - else: - if os.path.isdir(args.tasks): - import glob - - task_names = [] - yaml_path = os.path.join(args.tasks, "*.yaml") - for yaml_file in glob.glob(yaml_path): - config = utils.load_yaml_config(yaml_file) - task_names.append(config) - else: - task_list = args.tasks.split(",") - task_names = task_manager.match_tasks(task_list) - for task in [task for task in task_list if task not in task_names]: - if os.path.isfile(task): - config = utils.load_yaml_config(task) - task_names.append(config) - task_missing = [task for task in task_list if task not in task_names and "*" not in task] # we don't want errors if a wildcard ("*") task name was used - - if task_missing: - missing = ", ".join(task_missing) - eval_logger.error( - f"Tasks were not found: {missing}\n" f"{utils.SPACING}Try `lmms-eval --tasks list` for list of available tasks", - ) - raise ValueError( - f"Tasks not found: {missing}. Try `lmms-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues." 
- ) - - eval_logger.info(f"Selected Tasks: {task_names}") - request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests) - datetime_str = utils.get_datetime_str(timezone=args.timezone) - - if args.model == "phi3v": - args.model_args = args.model_args.replace("pretrained", "model_id_name") - results = evaluator.simple_evaluate( - model=args.model, - model_args=args.model_args, - tasks=task_names, - num_fewshot=args.num_fewshot, - batch_size=args.batch_size, - max_batch_size=args.max_batch_size, - device=args.device, - use_cache=args.use_cache, - limit=args.limit, - check_integrity=args.check_integrity, - write_out=args.write_out, - log_samples=args.log_samples, - evaluation_tracker=evaluation_tracker, - system_instruction=args.system_instruction, - apply_chat_template=args.apply_chat_template, - fewshot_as_multiturn=args.fewshot_as_multiturn, - gen_kwargs=args.gen_kwargs, - task_manager=task_manager, - verbosity=args.verbosity, - predict_only=args.predict_only, - random_seed=args.seed[0], - numpy_random_seed=args.seed[1], - torch_random_seed=args.seed[2], - fewshot_random_seed=args.seed[3], - cli_args=args, - datetime_str=datetime_str, - **request_caching_args, - ) - - if results is not None: - if args.log_samples: - samples = results.pop("samples") - else: - samples = None - dumped = json.dumps(results, indent=4, default=_handle_non_serializable) - if args.show_config: - print(dumped) - - batch_sizes = ",".join(map(str, results["config"]["batch_sizes"])) - - evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None, datetime_str=datetime_str) - - if args.log_samples: - for task_name, config in results["configs"].items(): - evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name]) - - if evaluation_tracker.push_results_to_hub or evaluation_tracker.push_samples_to_hub: - evaluation_tracker.recreate_metadata_card() - - return results, samples - return None, None - -from lmms_eval import evaluator -from lmms_eval.utils import make_table - - -def eval(args): - try: - from auto_round import AutoRoundConfig - except: - from auto_round.auto_quantizer import AutoHfQuantizer - - cli_evaluate(args) - \ No newline at end of file diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index b1554eb5..46d79e7a 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -14,6 +14,7 @@ import os import argparse +import json import torch import transformers @@ -278,9 +279,9 @@ def tune(args): else: cls = AutoModelForCausalLM - model = cls.from_pretrained( - model_name,trust_remote_code=not args.disable_trust_remote_code, torch_dtype=torch_dtype, - device_map="auto" if use_auto_mapping else None) + model = cls.from_pretrained( + model_name,trust_remote_code=not args.disable_trust_remote_code, torch_dtype=torch_dtype, + device_map="auto" if use_auto_mapping else None) if "cogvlm2" in model_name: model.config.model_type = "cogvlm2" @@ -384,4 +385,69 @@ def eval(args): ignore=args.ignore ) +def setup_lmms_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", "--model_name", "--model_name_or_path", + help="model name or path") + parser.add_argument( + "--tasks", + default="pope,textvqa_val,scienceqa,mmbench_en", + help="To get full list of tasks, use the command lmms-eval --tasks list", + ) + parser.add_argument("--output_dir", default="./tmp_autoround", type=str, + help="the directory to save quantized model") + parser.add_argument( + "--num_fewshot", + 
type=int, + default=None, + help="Number of examples in few-shot context", + ) + parser.add_argument( + "--batch_size", + "-b", + type=str, + default=1, + metavar="auto|auto:N|N", + help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.", + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=None, + metavar="N", + help="Maximal batch size to try with --batch_size auto.", + ) + parser.add_argument( + "--device", + type=str, + default=None, + help="Device to use (e.g. cuda, cuda:0, cpu)", + ) + parser.add_argument( + "--limit", + type=float, + default=None, + help="Limit the number of examples per task. " "If <1, limit is a percentage of the total" + " number of examples.", + ) + args = parser.parse_args() + return args + +def lmms_eval(args): + from auto_round.mllm import lmms_eval + + results = lmms_eval( + model=args.model, + tasks=args.tasks, + output_dir=args.output_dir, + num_fewshot=args.num_fewshot, + limit=args.limit, + batch_size=args.batch_size, + max_batch_size=args.max_batch_size, + device=args.device, + use_cache=None, + apply_chat_template=False, + ) + return results + From eab118813cdf3bdf292a13213106043effbaf30c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 07:12:43 +0000 Subject: [PATCH 05/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/mllm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/mllm/README.md b/auto_round/mllm/README.md index d55236b2..246ad02b 100644 --- a/auto_round/mllm/README.md +++ b/auto_round/mllm/README.md @@ -37,7 +37,7 @@ autoround.save_quantized(output_dir, format='auto_round', inplace=True) ``` ### Dataset -For mllm, we used liuhaotian/llava_conv_58k as our defalt calib datasets. Through command ```--dataset```, user can use other datasets such as "liuhaotian/llava_instruct_80k", "liuhaotian/llava_instruct_150k" or a file path to use local file. +For mllm, we used liuhaotian/llava_conv_58k as our default calib datasets. Through command ```--dataset```, user can use other datasets such as "liuhaotian/llava_instruct_80k", "liuhaotian/llava_instruct_150k" or a file path to use local file. ### Limitation So far, auto-round for mllm supports five model families, include Qwen2, Llama, Phi3v, Llava and CogVLM2. From 5351ca36b98157c50018de00c1f83712a08ae72e Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 13 Nov 2024 02:12:45 -0500 Subject: [PATCH 06/12] update readme Signed-off-by: n1ck-guo --- auto_round/mllm/README.md | 30 +++++++++++++++++++++++++++--- auto_round/mllm/eval.py | 5 +++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/auto_round/mllm/README.md b/auto_round/mllm/README.md index 9060c168..d55236b2 100644 --- a/auto_round/mllm/README.md +++ b/auto_round/mllm/README.md @@ -1,4 +1,19 @@ # AutoRound for MLLMs +## Basic Usage (Gaudi2/CPU/GPU) +A user guide detailing the full list of supported arguments is provided by calling ```auto-round -h``` on the terminal.Alternatively, you can use ```auto_round``` instead of ```auto-round```. Set the format you want in `format` and +multiple formats exporting has been supported. 
+ +```bash +auto—round-mllm \ + --model Qwen/Qwen2-VL-2B-Instruct\ + --bits 4 \ + --batch_size 1 \ + --nsamples 128 \ + --gradient_accumulate_steps 4 \ + --group_size 128 \ + --format "auto_round" \ + --output_dir ./tmp_autoround +``` ## API Usage (Gaudi2/CPU/GPU) ```python from auto_round import AutoRoundMLLM @@ -21,7 +36,14 @@ output_dir = "./tmp_autoround" autoround.save_quantized(output_dir, format='auto_round', inplace=True) ``` -## Template +### Dataset +For mllm, we used liuhaotian/llava_conv_58k as our defalt calib datasets. Through command ```--dataset```, user can use other datasets such as "liuhaotian/llava_instruct_80k", "liuhaotian/llava_instruct_150k" or a file path to use local file. + +### Limitation +So far, auto-round for mllm supports five model families, include Qwen2, Llama, Phi3v, Llava and CogVLM2. + +## New Models Support +### Template For autoround MLLMs, using Template to customize different operations for different models. User can add a custom chat template through json file as below. ```json { @@ -33,7 +55,9 @@ For autoround MLLMs, using Template to customize different operations for differ "format_separator": "\n", "default_system": "You are a helpful assistant.", "replace_tokens": ["", "<|vision_start|><|image_pad|><|vision_end|>"], - "processor": "qwen2_vl" } + "extra_encode": "True", + "processor": "qwen2_vl" +} ``` The special token ```{{content}}``` is a placeholder to tell the preprocessor where to fill in the corresponding dialogue content. @@ -45,5 +69,5 @@ For example, the input conversations:
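The sketch below reconstructs one such conversation from the converted output shown afterwards; treat it as an illustration rather than an exact dataset sample.

```python
# Illustrative input conversation (reconstructed from the converted output below).
# The "<image>" placeholder marks where the image tokens go; the template's
# replace_tokens entry maps the image placeholder to "<|vision_start|><|image_pad|><|vision_end|>".
conversation = [
    {"role": "user", "value": "<image>\nWhat are the colors of the bus in the image?"},
    {"role": "assistant", "value": "The bus in the image is white and red."},
    {"role": "user", "value": "What feature can be seen on the back of the bus?"},
    {"role": "assistant", "value": "The back of the bus features an advertisement."},
    {"role": "user", "value": "Is the bus driving down the street or pulled off to the side?"},
    {"role": "assistant", "value": "The bus is driving down the street, which is crowded with people and other vehicles."},
]
```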
Using the above template, the input will be converted to the specified format required by Qwen2-vl as below:
```'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>\nWhat are the colors of the bus in the image?<|im_end|>\n<|im_start|>assistant\nThe bus in the image is white and red.<|im_end|>\n<|im_start|>user\nWhat feature can be seen on the back of the bus?<|im_end|>\n<|im_start|>assistant\nThe back of the bus features an advertisement.<|im_end|>\n<|im_start|>user\nIs the bus driving down the street or pulled off to the side?<|im_end|>\n<|im_start|>assistant\nThe bus is driving down the street, which is crowded with people and other vehicles.<|im_end|>\n'```. -## Processor +### Processor Processor is callback interface for calling different processors, such as texts or images processors, for MLLMs. User can define own processor and use registration function to declare. For more information, please refer to the relevant code in ```auto_round/mllm/processor.py```. \ No newline at end of file diff --git a/auto_round/mllm/eval.py b/auto_round/mllm/eval.py index 817174b8..09014b53 100644 --- a/auto_round/mllm/eval.py +++ b/auto_round/mllm/eval.py @@ -348,6 +348,11 @@ def lmms_eval( use_cache=None, apply_chat_template=False ): + try: + from auto_round import AutoRoundConfig + except: + from auto_round.auto_quantizer import AutoHfQuantizer + if isinstance(tasks, str): tasks = tasks.replace(' ', '').split(',') From 3115f9670115cb606f894987128a14bff7b0d0cf Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 13 Nov 2024 03:09:21 -0500 Subject: [PATCH 07/12] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 7 ++----- auto_round/mllm/README.md | 5 ++--- auto_round/mllm/eval.py | 7 +++---- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 8815c8c0..fa2d2ff9 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -41,11 +41,8 @@ def run_mllm(): tune(args) def run_lmms(): - try: - import importlib - importlib.import_module("lmms_eval") - except: - raise ImportError("please install the lmms_eval firt.") + from transformers.utils.versions import require_version + require_version("lmms_eval", "please install the lmms_eval firt.") # from auto_round.script.lmms_eval import setup_lmms_args, eval from auto_round.script.mllm import setup_lmms_parser, lmms_eval args = setup_lmms_parser() diff --git a/auto_round/mllm/README.md b/auto_round/mllm/README.md index d55236b2..17370354 100644 --- a/auto_round/mllm/README.md +++ b/auto_round/mllm/README.md @@ -1,6 +1,6 @@ # AutoRound for MLLMs ## Basic Usage (Gaudi2/CPU/GPU) -A user guide detailing the full list of supported arguments is provided by calling ```auto-round -h``` on the terminal.Alternatively, you can use ```auto_round``` instead of ```auto-round```. Set the format you want in `format` and +A user guide detailing the full list of supported arguments is provided by calling ```auto-round-mllm -h``` on the terminal.Alternatively, you can use ```auto_round_mllm``` instead of ```auto-round-mllm```. Set the format you want in `format` and multiple formats exporting has been supported. ```bash @@ -8,7 +8,6 @@ auto—round-mllm \ --model Qwen/Qwen2-VL-2B-Instruct\ --bits 4 \ --batch_size 1 \ - --nsamples 128 \ --gradient_accumulate_steps 4 \ --group_size 128 \ --format "auto_round" \ @@ -37,7 +36,7 @@ autoround.save_quantized(output_dir, format='auto_round', inplace=True) ``` ### Dataset -For mllm, we used liuhaotian/llava_conv_58k as our defalt calib datasets. 
Through command ```--dataset```, user can use other datasets such as "liuhaotian/llava_instruct_80k", "liuhaotian/llava_instruct_150k" or a file path to use local file. +For mllm, we used liuhaotian/llava_conv_58k as our defalt calib datasets. Through argument ```--dataset```, user can use other datasets such as "liuhaotian/llava_instruct_80k", "liuhaotian/llava_instruct_150k" or a file path to use local file. ### Limitation So far, auto-round for mllm supports five model families, include Qwen2, Llama, Phi3v, Llava and CogVLM2. diff --git a/auto_round/mllm/eval.py b/auto_round/mllm/eval.py index 09014b53..3997e251 100644 --- a/auto_round/mllm/eval.py +++ b/auto_round/mllm/eval.py @@ -348,10 +348,7 @@ def lmms_eval( use_cache=None, apply_chat_template=False ): - try: - from auto_round import AutoRoundConfig - except: - from auto_round.auto_quantizer import AutoHfQuantizer + from auto_round import AutoRoundConfig if isinstance(tasks, str): tasks = tasks.replace(' ', '').split(',') @@ -379,6 +376,8 @@ def lmms_eval( model_args = f"model_id_name={model}" else: model_args = f"pretrained={model}" + if MODEL_TYPE_TO_LMMS_MODEL[model_type] == "llama_vision": + model_args += f",device_map={device}" results = _lmms_eval.evaluator.simple_evaluate( model=MODEL_TYPE_TO_LMMS_MODEL[model_type], model_args=model_args, From af95be2d438ed63c10e71729ad2cb9f86aafb858 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Wed, 13 Nov 2024 03:30:50 -0500 Subject: [PATCH 08/12] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index fa2d2ff9..53490a01 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -42,7 +42,7 @@ def run_mllm(): def run_lmms(): from transformers.utils.versions import require_version - require_version("lmms_eval", "please install the lmms_eval firt.") + require_version("lmms_eval", "lmms_eval need to be installed, `pip install lmms_eval`") # from auto_round.script.lmms_eval import setup_lmms_args, eval from auto_round.script.mllm import setup_lmms_parser, lmms_eval args = setup_lmms_parser() From 78dc6390035b1111124ea1f720c7e94682841c89 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 14 Nov 2024 02:44:15 -0500 Subject: [PATCH 09/12] add matrix Signed-off-by: n1ck-guo --- auto_round/mllm/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/auto_round/mllm/README.md b/auto_round/mllm/README.md index f226f270..f045686a 100644 --- a/auto_round/mllm/README.md +++ b/auto_round/mllm/README.md @@ -38,8 +38,16 @@ autoround.save_quantized(output_dir, format='auto_round', inplace=True) ### Dataset For mllm, we used liuhaotian/llava_conv_58k as our default calib datasets. Through argument ```--dataset```, user can use other datasets such as "liuhaotian/llava_instruct_80k", "liuhaotian/llava_instruct_150k" or a file path to use local file. -### Limitation -So far, auto-round for mllm supports five model families, include Qwen2, Llama, Phi3v, Llava and CogVLM2. +### Support Matrix +So far, auto-round for mllm supports five model families, include Qwen2-VL, Llama-Vision, Phi3-Vision, Llava-v1.5 and CogVLM2. 
+ +|Model |Eval Lib |calibration dataset|quant nontext module| +|---------------|-----------|-------------------|--------------------| +|Qwen2-VL |vlmeval |pile/llava |✔ | +|Llama-Vision |lmms_eval |llava |✔ | +|Phi3-Vision |vlmeval |pile/llava |✔ | +|Llava-v1.5 |lmms_eval |pile/llava |- | +|CogVLM2 |lmms_eval |pile/llava |✔ | ## New Models Support ### Template From f1bcee36ddcc49e41a27b5e7bc506d101a9b07f9 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 14 Nov 2024 20:34:00 -0500 Subject: [PATCH 10/12] fix Signed-off-by: n1ck-guo --- auto_round/mllm/README.md | 1 + auto_round/mllm/eval.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/auto_round/mllm/README.md b/auto_round/mllm/README.md index f045686a..629e1c25 100644 --- a/auto_round/mllm/README.md +++ b/auto_round/mllm/README.md @@ -4,6 +4,7 @@ A user guide detailing the full list of supported arguments is provided by calli multiple formats exporting has been supported. ```bash +## experimental feature, default hyperparameters may be changed later auto—round-mllm \ --model Qwen/Qwen2-VL-2B-Instruct\ --bits 4 \ diff --git a/auto_round/mllm/eval.py b/auto_round/mllm/eval.py index 3997e251..887286de 100644 --- a/auto_round/mllm/eval.py +++ b/auto_round/mllm/eval.py @@ -378,6 +378,9 @@ def lmms_eval( model_args = f"pretrained={model}" if MODEL_TYPE_TO_LMMS_MODEL[model_type] == "llama_vision": model_args += f",device_map={device}" + class CliArgs: + output_path = output_dir + results = _lmms_eval.evaluator.simple_evaluate( model=MODEL_TYPE_TO_LMMS_MODEL[model_type], model_args=model_args, @@ -389,6 +392,7 @@ def lmms_eval( device=device, use_cache=use_cache, apply_chat_template=apply_chat_template, + cli_args=CliArgs() ) # print and save result From e24fd9279b60f1c7b9c69719d5acb10b5a948be0 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 14 Nov 2024 20:52:42 -0500 Subject: [PATCH 11/12] update Signed-off-by: n1ck-guo --- auto_round/mllm/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/mllm/README.md b/auto_round/mllm/README.md index 629e1c25..e8043f6e 100644 --- a/auto_round/mllm/README.md +++ b/auto_round/mllm/README.md @@ -4,7 +4,7 @@ A user guide detailing the full list of supported arguments is provided by calli multiple formats exporting has been supported. 
```bash -## experimental feature, default hyperparameters may be changed later +# experimental feature, default hyperparameters may be changed later auto—round-mllm \ --model Qwen/Qwen2-VL-2B-Instruct\ --bits 4 \ From 7667d6252fd9193ac4334bdcc2fce4916a1b2f9a Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 14 Nov 2024 21:05:10 -0500 Subject: [PATCH 12/12] pylint Signed-off-by: n1ck-guo --- auto_round/auto_quantizer.py | 2 +- auto_round/mllm/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/auto_quantizer.py b/auto_round/auto_quantizer.py index 7cbcd2ea..46f41fc2 100644 --- a/auto_round/auto_quantizer.py +++ b/auto_round/auto_quantizer.py @@ -536,7 +536,7 @@ def remove_str(input_string: str, sub_str) -> str: ) if "gptq" in layer_backend and "exllamav2" in layer_backend: try: - from exllamav2_kernels import gemm_half_q_half, make_q_matrix + from exllamav2_kernels import gemm_half_q_half, make_q_matrix # pylint: disable=E0611 except: logger.warning_once( "For better inference performance, please install exllamav2 kernel " diff --git a/auto_round/mllm/README.md b/auto_round/mllm/README.md index e8043f6e..cd342f70 100644 --- a/auto_round/mllm/README.md +++ b/auto_round/mllm/README.md @@ -44,7 +44,7 @@ So far, auto-round for mllm supports five model families, include Qwen2-VL, Llam |Model |Eval Lib |calibration dataset|quant nontext module| |---------------|-----------|-------------------|--------------------| -|Qwen2-VL |vlmeval |pile/llava |✔ | +|Qwen2-VL |vlmeval |pile/llava |- | |Llama-Vision |lmms_eval |llava |✔ | |Phi3-Vision |vlmeval |pile/llava |✔ | |Llava-v1.5 |lmms_eval |pile/llava |- |
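For reference, a minimal sketch of driving the new evaluation path directly from Python; the model path and task list below are placeholders, and the keyword arguments mirror the `lmms_eval` signature added in this series.

```python
# Minimal sketch (placeholder path and tasks): evaluate a quantized checkpoint with the
# lmms_eval wrapper added above. Assumes the optional lmms_eval package is installed.
from auto_round.mllm import lmms_eval

results = lmms_eval(
    model="./tmp_autoround",       # directory produced by autoround.save_quantized(...)
    tasks="pope,textvqa_val",      # comma-separated lmms-eval task names
    output_dir="./tmp_autoround",  # results saved as <model_name>_<timestamp>_result.json
    batch_size=1,
    device="cuda:0",
)
```

The same flow is reachable from the command line through the new `--lmms` switch handled in `auto_round/__main__.py`.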