feat: Add eval-desc CLI command for descriptor evaluation with 3D output format (#4903)

Copilot · njzjz · pre-commit-ci[bot] · web-flow · commit 3f0087b48d8a · 2025-08-25T16:55:54.000Z
This PR implements a new command-line interface for evaluating descriptors using trained DeePMD models, addressing the feature request for making the `eval_descriptor` function available from the command line. ## Overview The new `dp eval-desc` command allows users to generate descriptor matrices from their models using a simple CLI interface, similar to the existing `dp test` command. ## Usage ```bash # Basic usage dp eval-desc -m model.pb -s /path/to/system # With custom output directory dp eval-desc -m model.pth -s /path/to/system -o my_descriptors # Using datafile with multiple systems dp eval-desc -m model.pb -f systems_list.txt -o desc_output # For multi-task models dp eval-desc -m model.pth -s system_dir --head task_branch ``` ## Output Format Descriptors are saved as NumPy `.npy` files in 3D format (nframes, natoms, ndesc) preserving the natural structure of the data with separate dimensions for frames, atoms, and descriptor components. This format maintains the original data organization and is suitable for various analysis workflows. ## Implementation Details The implementation follows the same architectural pattern as the existing `dp test` command: - **CLI Parser**: Added argument parser in `deepmd/main.py` with options for model (`-m`), system (`-s`), datafile (`-f`), output (`-o`), and model branch (`--head`) - **Command Routing**: Integrated into the entrypoints system in `deepmd/entrypoints/main.py` - **Core Functionality**: New `eval_desc.py` module that uses `DeepEval.eval_descriptor()` to generate descriptors and saves them as `.npy` files in their natural 3D format - **Documentation**: Updated user guide and API documentation with output format details - **Testing**: Comprehensive tests following the pattern of existing `dp test` functionality Fixes #4503.  
--- ✨ Let Copilot coding agent [set things up for you](https://github.com/deepmodeling/deepmd-kit/issues/new?title=✨+Set+up+Copilot+instructions&body=Configure%20instructions%20for%20this%20repository%20as%20documented%20in%20%5BBest%20practices%20for%20Copilot%20coding%20agent%20in%20your%20repository%5D%28https://gh.io/copilot-coding-agent-tips%29%2E%0A%0A%3COnboard%20this%20repo%3E&assignees=copilot) — coding agent works faster and does higher quality work when set up for your repo. --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: njzjz <9496702+njzjz@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/deepmd/entrypoints/eval_desc.py b/deepmd/entrypoints/eval_desc.py
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Evaluate descriptors using trained DeePMD model."""
+
+import logging
+import os
+from pathlib import (
+    Path,
+)
+from typing import (
+    Optional,
+)
+
+import numpy as np
+
+from deepmd.common import (
+    expand_sys_str,
+)
+from deepmd.infer.deep_eval import (
+    DeepEval,
+)
+from deepmd.utils.data import (
+    DeepmdData,
+)
+
+__all__ = ["eval_desc"]  # the only public name exported by this module
+
+log = logging.getLogger(__name__)  # module-level logger used by eval_desc
+
+
+def eval_desc(
+    *,
+    model: str,
+    system: str,
+    datafile: Optional[str],
+    output: str = "desc",
+    head: Optional[str] = None,
+    **kwargs,
+) -> None:
+    """Evaluate descriptors for given systems.
+
+    Parameters
+    ----------
+    model : str
+        path where model is stored
+    system : str
+        system directory; only used when ``datafile`` is None
+    datafile : Optional[str]
+        the path to the list of systems to process (one per line; blank
+        lines are skipped), or None to use ``system``
+    output : str
+        output directory for descriptor files; created if missing
+    head : Optional[str], optional
+        (Supported backend: PyTorch) Task head if in multi-task mode.
+    **kwargs
+        additional arguments; accepted and ignored
+
+    Notes
+    -----
+    Descriptors are saved as 3D numpy arrays with shape (nframes, natoms, ndesc)
+    where each frame contains the descriptors for all atoms. Each file is named
+    after the last component of the system's absolute path, so systems sharing
+    a directory name overwrite each other (a warning is logged).
+
+    Raises
+    ------
+    RuntimeError
+        if no valid system was found
+    """
+    if datafile is not None:
+        with open(datafile) as datalist:
+            # a blank line would otherwise be treated as an empty system path
+            all_sys = [ss for ss in datalist.read().splitlines() if ss.strip()]
+    else:
+        all_sys = expand_sys_str(system)
+
+    if len(all_sys) == 0:
+        raise RuntimeError("Did not find valid system")
+
+    # init model
+    dp = DeepEval(model, head=head)
+
+    # create output directory
+    output_dir = Path(output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    written = set()  # files saved by this call, used to detect name clashes
+    for system_path in all_sys:
+        log.info("# -------output of dp eval_desc------- ")
+        log.info(f"# processing system : {system_path}")
+
+        # create data class
+        tmap = dp.get_type_map()
+        data = DeepmdData(
+            system_path,
+            set_prefix="set",
+            shuffle_test=False,
+            type_map=tmap,
+            sort_atoms=False,
+        )
+
+        # get test data
+        test_data = data.get_test()
+        nframes = test_data["box"].shape[0]
+
+        # prepare input data; no box is passed for non-periodic systems
+        coord = test_data["coord"].reshape([nframes, -1])
+        box = test_data["box"] if data.pbc else None
+        if data.mixed_type:
+            atype = test_data["type"].reshape([nframes, -1])
+        else:
+            atype = test_data["type"][0]
+
+        # optional inputs: used only if the model expects them and the data has them
+        fparam = None
+        if dp.get_dim_fparam() > 0 and "fparam" in test_data:
+            fparam = test_data["fparam"]
+        aparam = None
+        if dp.get_dim_aparam() > 0 and "aparam" in test_data:
+            aparam = test_data["aparam"]
+
+        # evaluate descriptors
+        log.info(f"# evaluating descriptors for {nframes} frames")
+        descriptors = dp.eval_descriptor(
+            coord,
+            box,
+            atype,
+            fparam=fparam,
+            aparam=aparam,
+        )
+
+        # save descriptors; abspath resolves "." and OS-specific trailing
+        # separators, which would otherwise yield an empty or "." file stem
+        system_name = os.path.basename(os.path.abspath(system_path))
+        desc_file = output_dir / f"{system_name}.npy"
+        if desc_file in written:
+            log.warning(f"# overwriting {desc_file}: duplicate system name")
+        written.add(desc_file)
+        np.save(desc_file, descriptors)
+
+        log.info(f"# descriptors saved to {desc_file}")
+        log.info(f"# descriptor shape: {descriptors.shape}")
+        log.info("# ----------------------------------- ")
+
+    log.info("# eval_desc completed successfully")
diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py
@@ -18,6 +18,9 @@
 from deepmd.entrypoints.doc import (
     doc_train_input,
 )
+from deepmd.entrypoints.eval_desc import (
+    eval_desc,
+)
 from deepmd.entrypoints.gui import (
     start_dpgui,
 )
@@ -65,6 +68,14 @@ def main(args: argparse.Namespace) -> None:
             strict_prefer=False,
         )
         test(**dict_args)
+    elif args.command == "eval-desc":
+        dict_args["model"] = format_model_suffix(
+            dict_args["model"],
+            feature=Backend.Feature.DEEP_EVAL,
+            preferred_backend=args.backend,
+            strict_prefer=False,
+        )
+        eval_desc(**dict_args)
     elif args.command == "doc-train-input":
         doc_train_input(**dict_args)
     elif args.command == "model-devi":
diff --git a/deepmd/main.py b/deepmd/main.py
@@ -416,6 +416,56 @@ def main_parser() -> argparse.ArgumentParser:
         help="(Supported backend: PyTorch) Task head (alias: model branch) to test if in multi-task mode.",
     )
 
+    # * eval_desc script ***************************************************************
+    parser_eval_desc = subparsers.add_parser(
+        "eval-desc",
+        parents=[parser_log],
+        help="evaluate descriptors using the model",
+        formatter_class=RawTextArgumentDefaultsHelpFormatter,
+        epilog=textwrap.dedent(
+            """\
+        examples:
+            dp eval-desc -m graph.pb -s /path/to/system -o desc
+        """
+        ),
+    )
+    parser_eval_desc.add_argument(
+        "-m",
+        "--model",
+        default="frozen_model",
+        type=str,
+        help="Frozen model file (prefix) to import. TensorFlow backend: suffix is .pb; PyTorch backend: suffix is .pth.",
+    )
+    parser_eval_desc_subgroup = parser_eval_desc.add_mutually_exclusive_group()
+    parser_eval_desc_subgroup.add_argument(
+        "-s",
+        "--system",
+        default=".",
+        type=str,
+        help="The system dir. Recursively detect systems in this directory",
+    )
+    parser_eval_desc_subgroup.add_argument(
+        "-f",
+        "--datafile",
+        default=None,
+        type=str,
+        help="The path to the datafile, each line of which is a path to one data system.",
+    )
+    parser_eval_desc.add_argument(
+        "-o",
+        "--output",
+        default="desc",
+        type=str,
+        help="Output directory for descriptor files. Descriptors will be saved as desc/(system_name).npy",
+    )
+    parser_eval_desc.add_argument(
+        "--head",
+        "--model-branch",
+        default=None,
+        type=str,
+        help="(Supported backend: PyTorch) Task head (alias: model branch) to use if in multi-task mode.",
+    )
+
     # * compress model *****************************************************************
     # Compress a model, which including tabulating the embedding-net.
     # The table is composed of fifth-order polynomial coefficients and is assembled
@@ -909,6 +959,7 @@ def main(args: Optional[list[str]] = None) -> None:
 
     if args.command in (
         "test",
+        "eval-desc",
         "doc-train-input",
         "model-devi",
         "neighbor-stat",
diff --git a/doc/inference/python.md b/doc/inference/python.md
@@ -19,6 +19,21 @@ e, f, v = dp.eval(coord, cell, atype)
 
 where `e`, `f` and `v` are predicted energy, force and virial of the system, respectively.
 
+One can also evaluate the descriptors of the model:
+
+```python
+from deepmd.infer import DeepPot
+import numpy as np
+
+dp = DeepPot("graph.pb")
+coord = np.array([[1, 0, 0], [0, 0, 1.5], [1, 0, 3]]).reshape([1, -1])
+cell = np.diag(10 * np.ones(3)).reshape([1, -1])
+atype = [1, 0, 1]
+descriptors = dp.eval_descriptor(coord, cell, atype)
+```
+
+where `descriptors` is the descriptor array of the system, with shape (nframes, natoms, ndesc). This can also be done using the command line interface `dp eval-desc` as described in the [test documentation](../test/test.md).
+
 Furthermore, one can use the python interface to calculate model deviation.
 
 ```python
diff --git a/doc/test/test.md b/doc/test/test.md
@@ -17,3 +17,25 @@ An explanation will be provided
 ```{program-output} dp test -h
 
 ```
+
+## Evaluate descriptors
+
+The descriptors of a model can be evaluated and saved using `dp eval-desc`. A typical usage of `dp eval-desc` is
+
+```bash
+dp eval-desc -m graph.pb -s /path/to/system -o desc
+```
+
+where `-m` gives the model file, `-s` the path to the system directory (or `-f` for a datafile containing paths to systems), and `-o` the output directory where descriptor files will be saved. The descriptors for each system will be saved as `.npy` files with the format `desc/(system_name).npy`. Each descriptor file contains a 3D array with shape (nframes, natoms, ndesc).
+
+Several other command line options can be passed to `dp eval-desc`, which can be checked with
+
+```bash
+$ dp eval-desc --help
+```
+
+The available options and their explanations are listed below:
+
+```{program-output} dp eval-desc -h
+
+```
diff --git a/source/tests/pt/test_eval_desc.py b/source/tests/pt/test_eval_desc.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import json
+import os
+import shutil
+import tempfile
+import unittest
+from copy import (
+    deepcopy,
+)
+from pathlib import (
+    Path,
+)
+
+import numpy as np
+import torch
+
+from deepmd.entrypoints.eval_desc import (
+    eval_desc,
+)
+from deepmd.pt.entrypoints.main import (
+    get_trainer,
+)
+
+from .model.test_permutation import (
+    model_se_e2_a,
+)
+
+
+class DPEvalDesc:
+    """Shared eval-desc test; subclasses set config, input_json and output_dir."""
+
+    def test_dp_eval_desc_1_frame(self) -> None:
+        """Script a freshly built model, run ``eval_desc`` on it, check the output."""
+        trainer = get_trainer(deepcopy(self.config))
+        with torch.device("cpu"):
+            input_dict, _, _ = trainer.get_data(is_train=False)
+        has_spin = getattr(trainer.model, "has_spin", False)
+        if callable(has_spin):
+            has_spin = has_spin()
+        if not has_spin:
+            input_dict.pop("spin", None)
+        input_dict["do_atomic_virial"] = True
+        trainer.model(**input_dict)  # eager forward pass before scripting
+        model = torch.jit.script(trainer.model)
+        system = self.config["training"]["validation_data"]["systems"][0]
+        # keep only the path: an open handle leaks and blocks reopening on Windows
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pth") as tmp_model:
+            model_path = tmp_model.name
+        try:
+            torch.jit.save(model, model_path)
+            eval_desc(
+                model=model_path, system=system, datafile=None, output=self.output_dir
+            )
+        finally:
+            os.unlink(model_path)
+
+        # Check that descriptor file was created
+        system_name = os.path.basename(system.rstrip("/"))
+        desc_file = os.path.join(self.output_dir, f"{system_name}.npy")
+        self.assertTrue(os.path.exists(desc_file))
+
+        descriptors = np.load(desc_file)
+        self.assertIsInstance(descriptors, np.ndarray)
+        self.assertEqual(len(descriptors.shape), 3)  # Should be 3D array
+        self.assertGreater(descriptors.shape[0], 0)  # Should have frames
+        self.assertGreater(descriptors.shape[1], 0)  # Should have atoms
+        self.assertGreater(descriptors.shape[2], 0)  # Should have descriptor dimensions
+
+    def tearDown(self) -> None:
+        """Remove training artifacts from the working directory and the output dir."""
+        for f in os.listdir("."):
+            if f.startswith("model") and f.endswith(".pt"):
+                os.remove(f)
+            if f in ["lcurve.out", self.input_json]:
+                os.remove(f)
+            if f in ["stat_files"]:
+                shutil.rmtree(f)
+        if hasattr(self, "output_dir") and os.path.exists(self.output_dir):
+            shutil.rmtree(self.output_dir)
+
+
+class TestDPEvalDescSeA(DPEvalDesc, unittest.TestCase):
+    """Run the shared eval-desc check with the ``model_se_e2_a`` model section."""
+
+    def setUp(self) -> None:
+        self.output_dir = "test_eval_desc_output"
+        water_dir = Path(__file__).parent / "water"
+        with open(str(water_dir / "se_atten.json")) as template:
+            self.config = json.load(template)
+        systems = [str(water_dir / "data" / "single")]
+        self.config["training"].update(numb_steps=1, save_freq=1)
+        self.config["training"]["training_data"]["systems"] = systems
+        self.config["training"]["validation_data"]["systems"] = systems
+        self.config["model"] = deepcopy(model_se_e2_a)
+        self.input_json = "test_eval_desc.json"
+        Path(self.input_json).write_text(json.dumps(self.config, indent=4))
+
+
+if __name__ == "__main__":  # allow running this test module directly
+    unittest.main()