Generate _ModelInfo properties file when loading to improve loading speed (vllm-project#23558)

manoelmarques · hmellor · ProExpertProg · xuebwang-amd · commit 875493f060df · 2025-10-24T09:19:19.000Z
Signed-off-by: Manoel Marques <manoel.marques@ibm.com>
Signed-off-by: Manoel Marques <manoelmrqs@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py
@@ -2,7 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from vllm.logging_utils.formatter import NewLineFormatter
+from vllm.logging_utils.log_time import logtime
 
 __all__ = [
     "NewLineFormatter",
+    "logtime",
 ]
diff --git a/vllm/logging_utils/log_time.py b/vllm/logging_utils/log_time.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Provides a timeslice logging decorator
+"""
+
+import functools
+import time
+
+
+def logtime(logger, msg=None):
+    """
+    Logs the execution time of the decorated function.
+    Always place it beneath other decorators.
+    """
+
+    def _inner(func):
+
+        @functools.wraps(func)
+        def _wrapper(*args, **kwargs):
+            start = time.perf_counter()
+            result = func(*args, **kwargs)
+            elapsed = time.perf_counter() - start
+
+            prefix = f"Function '{func.__module__}.{func.__qualname__}'" \
+                if msg is None else msg
+            logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed)
+            return result
+
+        return _wrapper
+
+    return _inner
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
@@ -11,6 +11,7 @@
 import time
 from collections import defaultdict
 from collections.abc import Generator
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Optional, Union
 
@@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path],
     return lock
 
 
+@contextmanager
+def atomic_writer(filepath: Union[str, Path],
+                  mode: str = 'w',
+                  encoding: Optional[str] = None):
+    """
+    Context manager that provides an atomic file writing routine.
+
+    The context manager writes to a temporary file and, if successful,
+    atomically replaces the original file.
+
+    Args:
+        filepath (str or Path): The path to the file to write.
+        mode (str): The file mode for the temporary file (e.g., 'w', 'wb').
+        encoding (str): The encoding for text mode.
+
+    Yields:
+        file object: A handle to the temporary file.
+    """
+    # Create a temporary file in the same directory as the target file
+    # to ensure it's on the same filesystem for an atomic replace.
+    temp_dir = os.path.dirname(filepath)
+    temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir)
+
+    try:
+        # Open the temporary file for writing
+        with os.fdopen(temp_fd, mode=mode, encoding=encoding) as temp_file:
+            yield temp_file
+
+        # If the 'with' block completes successfully,
+        # perform the atomic replace.
+        os.replace(temp_path, filepath)
+
+    except Exception:
+        logger.exception(
+            "Error during atomic write. Original file '%s' not modified",
+            filepath)
+        raise
+    finally:
+        # Clean up the temporary file if it still exists.
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+
+
 def maybe_download_from_modelscope(
         model: str,
         revision: Optional[str] = None,
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
@@ -4,24 +4,29 @@
 Whenever you add an architecture to this page, please also update
 `tests/models/registry.py` with example HuggingFace models for it.
 """
+import hashlib
 import importlib
+import json
 import os
 import pickle
 import subprocess
 import sys
 import tempfile
 from abc import ABC, abstractmethod
 from collections.abc import Set
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from functools import lru_cache
+from pathlib import Path
 from typing import Callable, Optional, TypeVar, Union
 
 import torch.nn as nn
 import transformers
 
+from vllm import envs
 from vllm.config import (ModelConfig, iter_architecture_defaults,
                          try_match_architecture_defaults)
 from vllm.logger import init_logger
+from vllm.logging_utils import logtime
 from vllm.transformers_utils.dynamic_module import (
     try_get_class_from_dynamic_module)
 
@@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
     module_name: str
     class_name: str
 
-    # Performed in another process to avoid initializing CUDA
+    @staticmethod
+    def _get_cache_dir() -> Path:
+        return Path(envs.VLLM_CACHE_ROOT) / "modelinfos"
+
+    def _get_cache_filename(self) -> str:
+        cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-")
+        return f"{cls_name}.json"
+
+    def _load_modelinfo_from_cache(self,
+                                   module_hash: str) -> _ModelInfo | None:
+        try:
+            try:
+                modelinfo_path = self._get_cache_dir(
+                ) / self._get_cache_filename()
+                with open(modelinfo_path, encoding="utf-8") as file:
+                    mi_dict = json.load(file)
+            except FileNotFoundError:
+                logger.debug(("Cached model info file "
+                              "for class %s.%s not found"), self.module_name,
+                             self.class_name)
+                return None
+
+            if mi_dict["hash"] != module_hash:
+                logger.debug(("Cached model info file "
+                              "for class %s.%s is stale"), self.module_name,
+                             self.class_name)
+                return None
+
+            # file not changed, use cached _ModelInfo properties
+            return _ModelInfo(**mi_dict["modelinfo"])
+        except Exception:
+            logger.exception(("Cached model info "
+                              "for class %s.%s error. "), self.module_name,
+                             self.class_name)
+            return None
+
+    def _save_modelinfo_to_cache(self, mi: _ModelInfo,
+                                 module_hash: str) -> None:
+        """save dictionary json file to cache"""
+        from vllm.model_executor.model_loader.weight_utils import atomic_writer
+        try:
+            modelinfo_dict = {
+                "hash": module_hash,
+                "modelinfo": asdict(mi),
+            }
+            cache_dir = self._get_cache_dir()
+            cache_dir.mkdir(parents=True, exist_ok=True)
+            modelinfo_path = cache_dir / self._get_cache_filename()
+            with atomic_writer(modelinfo_path, encoding='utf-8') as f:
+                json.dump(modelinfo_dict, f, indent=2)
+        except Exception:
+            logger.exception("Error saving model info cache.")
+
+    @logtime(logger=logger, msg="Registry inspect model class")
     def inspect_model_cls(self) -> _ModelInfo:
-        return _run_in_subprocess(
+        model_path = Path(
+            __file__).parent / f"{self.module_name.split('.')[-1]}.py"
+
+        assert model_path.exists(), \
+            f"Model {self.module_name} expected to be on path {model_path}"
+        with open(model_path, "rb") as f:
+            module_hash = hashlib.md5(f.read()).hexdigest()
+
+        mi = self._load_modelinfo_from_cache(module_hash)
+        if mi is not None:
+            logger.debug(("Loaded model info "
+                          "for class %s.%s from cache"), self.module_name,
+                         self.class_name)
+            return mi
+        else:
+            logger.debug(("Cache model info "
+                          "for class %s.%s miss. "
+                          "Loading model instead."), self.module_name,
+                         self.class_name)
+
+        # Performed in another process to avoid initializing CUDA
+        mi = _run_in_subprocess(
             lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
+        logger.debug("Loaded model info for class %s.%s", self.module_name,
+                     self.class_name)
+
+        # save cache file
+        self._save_modelinfo_to_cache(mi, module_hash)
+
+        return mi
 
     def load_model_cls(self) -> type[nn.Module]:
         mod = importlib.import_module(self.module_name)

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -2,7 +2,9 @@` |
| 2 | 2 | `# SPDX-FileCopyrightText: Copyright contributors to the vLLM project` |
| 3 | 3 |  |
| 4 | 4 | `from vllm.logging_utils.formatter import NewLineFormatter` |
|  | 5 | `+from vllm.logging_utils.log_time import logtime` |
| 5 | 6 |  |
| 6 | 7 | `__all__ = [` |
| 7 | 8 | `    "NewLineFormatter",` |
|  | 9 | `+    "logtime",` |
| 8 | 10 | `]` |