Skip to content

Commit

Permalink
Merge branch 'master' into refactor-finetune
Browse files Browse the repository at this point in the history
  • Loading branch information
zjgemi committed Jul 31, 2024
2 parents 3a45d4c + f5c5d95 commit fdefc70
Show file tree
Hide file tree
Showing 22 changed files with 2,162 additions and 13 deletions.
1 change: 0 additions & 1 deletion .git_archival.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
node: $Format:%H$
node-date: $Format:%cI$
describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$
ref-names: $Format:%D$
6 changes: 3 additions & 3 deletions .github/workflows/pub-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ jobs:
uses: actions/checkout@v4

- name: Log in to Docker Hub
uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Log in to the Container registry
uses: docker/login-action@0d4c9c5ea7693da7b068278f7b52bda2a190a446
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567
with:
registry: ghcr.io
username: ${{ github.actor }}
Expand All @@ -46,7 +46,7 @@ jobs:
ghcr.io/deepmodeling/dpgen2
- name: Build and push Docker images
uses: docker/build-push-action@c382f710d39a5bb4e430307530a720f50c2d3318
uses: docker/build-push-action@5176d81f87c23d6fc96624dfdbcd9f3830bbe445
with:
context: .
push: true
Expand Down
16 changes: 16 additions & 0 deletions dpgen2/entrypoint/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def dp_dist_train_args():
doc_template_script = "File names of the template training script. It can be a `List[str]`, the length of which is the same as `numb_models`. Each template script in the list is used to train a model. Can be a `str`, the models share the same template training script. "
dock_student_model_path = "The path of student model"
doc_student_model_uri = "The URI of student model"
doc_optional_files = "Optional files for training"

return [
Argument(
Expand All @@ -67,6 +68,13 @@ def dp_dist_train_args():
default=None,
doc=doc_student_model_uri,
),
Argument(
"optional_files",
list,
optional=True,
default=None,
doc=doc_optional_files,
),
]


Expand All @@ -76,6 +84,7 @@ def dp_train_args():
doc_template_script = "File names of the template training script. It can be a `List[str]`, the length of which is the same as `numb_models`. Each template script in the list is used to train a model. Can be a `str`, the models share the same template training script. "
doc_init_models_paths = "the paths to initial models"
doc_init_models_uri = "The URI of initial models"
doc_optional_files = "Optional files for training"

return [
Argument(
Expand Down Expand Up @@ -105,6 +114,13 @@ def dp_train_args():
default=None,
doc=doc_init_models_uri,
),
Argument(
"optional_files",
list,
optional=True,
default=None,
doc=doc_optional_files,
),
]


Expand Down
4 changes: 4 additions & 0 deletions dpgen2/entrypoint/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def make_concurrent_learning_op(
cl_step_config: dict = default_config,
upload_python_packages: Optional[List[os.PathLike]] = None,
valid_data: Optional[S3Artifact] = None,
train_optional_files: Optional[List[str]] = None,
):
if train_style in ("dp", "dp-dist"):
prep_run_train_op = PrepRunDPTrain(
Expand All @@ -165,6 +166,7 @@ def make_concurrent_learning_op(
run_config=run_train_config,
upload_python_packages=upload_python_packages,
valid_data=valid_data,
optional_files=train_optional_files,
)
else:
raise RuntimeError(f"unknown train_style {train_style}")
Expand Down Expand Up @@ -441,6 +443,7 @@ def workflow_concurrent_learning(
collect_data_config = config["step_configs"]["collect_data_config"]
cl_step_config = config["step_configs"]["cl_step_config"]
upload_python_packages = config.get("upload_python_packages", None)
train_optional_files = config["train"].get("optional_files", None)

if train_style == "dp":
init_models_paths = config["train"].get("init_models_paths", None)
Expand Down Expand Up @@ -490,6 +493,7 @@ def workflow_concurrent_learning(
cl_step_config=cl_step_config,
upload_python_packages=upload_python_packages,
valid_data=valid_data,
train_optional_files=train_optional_files,
)
scheduler = make_naive_exploration_scheduler(config)

Expand Down
10 changes: 10 additions & 0 deletions dpgen2/fp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
PrepFpOpAbacus,
RunFpOpAbacus,
)
from .cp2k import (
FpOpCp2kInputs,
PrepFpOpCp2k,
RunFpOpCp2k,
)
from .deepmd import (
DeepmdInputs,
PrepDeepmd,
Expand Down Expand Up @@ -40,4 +45,9 @@
"prep": PrepFpOpAbacus,
"run": RunFpOpAbacus,
},
"fpop_cp2k": {
"inputs": FpOpCp2kInputs,
"prep": PrepFpOpCp2k,
"run": RunFpOpCp2k,
},
}
9 changes: 1 addition & 8 deletions dpgen2/fp/abacus.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def execute(
sys = dpdata.LabeledSystem(str(workdir), fmt="abacus/relax")
else:
raise ValueError("Type of calculation %s not supported" % calculation)
out_name = run_config.get("out", fp_default_out_data_name)
out_name = fp_default_out_data_name
sys.to("deepmd/npy", workdir / out_name)

return OPIO(
Expand All @@ -212,13 +212,6 @@ def execute(
@staticmethod
def args():
doc_cmd = "The command of abacus"
doc_out = (
"The output dir name of labeled data. "
"In `deepmd/npy` format provided by `dpdata`."
)
return [
Argument("command", str, optional=True, default="abacus", doc=doc_cmd),
Argument(
"out", str, optional=True, default=fp_default_out_data_name, doc=doc_out
),
]
185 changes: 185 additions & 0 deletions dpgen2/fp/cp2k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
import os
from pathlib import (
Path,
)
from typing import (
List,
Optional,
)

import dpdata
from dargs import (
Argument,
)
from dflow.python import (
OP,
OPIO,
Artifact,
BigParameter,
OPIOSign,
)

try:
from fpop.cp2k import (
Cp2kInputs,
PrepCp2k,
RunCp2k,
)
except ModuleNotFoundError:
Cp2kInputs = PrepCp2k = RunCp2k = object

from ..constants import (
fp_default_out_data_name,
)


class FpOpCp2kInputs(Cp2kInputs):  # type: ignore
    """Thin wrapper around fpop's ``Cp2kInputs`` that exposes the dargs schema."""

    @staticmethod
    def args():
        """Return the argument schema: one mandatory ``inp_file`` path."""
        doc = "The path to the user-submitted CP2K input file."
        return [Argument("inp_file", str, optional=False, doc=doc)]


class PrepFpOpCp2k(OP):
    """Prepare CP2K first-principles task directories.

    Prunes atom types with zero atoms from each configuration's type map,
    then delegates actual task-directory creation to fpop's ``PrepCp2k``.
    """

    @classmethod
    def get_input_sign(cls):
        """Inputs: the fp ``config`` dict, the global ``type_map``, and
        the configuration directories (deepmd/npy systems)."""
        return OPIOSign(
            {
                "config": BigParameter(dict),
                "type_map": List[str],
                "confs": Artifact(List[Path]),
            }
        )

    @classmethod
    def get_output_sign(cls):
        """Outputs: the generated task names and task directories."""
        return OPIOSign(
            {
                "task_names": BigParameter(List[str]),
                "task_paths": Artifact(List[Path]),
            }
        )

    @OP.exec_sign_check
    def execute(
        self,
        ip: OPIO,
    ) -> OPIO:
        confs = []
        # CP2K cannot handle atom types that are present in the type map
        # but have zero atoms, so such types are stripped from each system
        # before task preparation.
        for p in ip["confs"]:
            # each directory containing a "type.raw" is one deepmd/npy system
            for f in p.rglob("type.raw"):
                system = f.parent
                s = dpdata.System(system, fmt="deepmd/npy")
                atom_numbs = []
                atom_names = []
                # keep only the types that actually occur in this system
                for numb, name in zip(s["atom_numbs"], s["atom_names"]):  # type: ignore https://github.com/microsoft/pyright/issues/5620
                    if numb > 0:
                        atom_numbs.append(numb)
                        atom_names.append(name)
                if atom_names != s["atom_names"]:
                    # remap per-atom type indices into the pruned name list
                    # (must happen before atom_names is overwritten below)
                    for i, t in enumerate(s["atom_types"]):  # type: ignore https://github.com/microsoft/pyright/issues/5620
                        s["atom_types"][i] = atom_names.index(s["atom_names"][t])  # type: ignore https://github.com/microsoft/pyright/issues/5620
                    s.data["atom_numbs"] = atom_numbs
                    s.data["atom_names"] = atom_names
                    # write the pruned system to a new location so the
                    # original artifact is left untouched
                    # NOTE(review): "output/%s" embeds the full system path;
                    # presumably intentional to keep targets unique — confirm
                    target = "output/%s" % system
                    s.to("deepmd/npy", target)
                    confs.append(Path(target))
                else:
                    # type map already clean; use the system as-is
                    confs.append(system)
        op_in = OPIO(
            {
                "inputs": ip["config"]["inputs"],
                "type_map": ip["type_map"],
                "confs": confs,
                "prep_image_config": ip["config"].get("prep", {}),
            }
        )
        op = PrepCp2k()
        return op.execute(op_in)  # type: ignore in the case of not importing fpop


def get_run_type(lines: List[str]) -> Optional[str]:
    """Return the CP2K ``RUN_TYPE`` value found in *lines*, or ``None``.

    Parameters
    ----------
    lines : List[str]
        The lines of a CP2K input file.

    Returns
    -------
    Optional[str]
        The value following the first active ``RUN_TYPE`` keyword
        (e.g. ``"ENERGY_FORCE"`` or ``"MD"``), or ``None`` if no such
        keyword is present.
    """
    for line in lines:
        if "RUN_TYPE" not in line:
            continue
        stripped = line.strip()
        # CP2K treats '!' and '#' as comment markers; a commented-out
        # RUN_TYPE must not shadow the active one.
        if stripped.startswith(("!", "#")):
            continue
        tokens = stripped.split()
        # Guard against a bare "RUN_TYPE" keyword with no value, which
        # previously returned the keyword itself.
        if len(tokens) >= 2:
            return tokens[-1]
    return None


class RunFpOpCp2k(OP):
    """Run a prepared CP2K task and convert its output to deepmd/npy.

    Delegates execution to fpop's ``RunCp2k``, then parses the CP2K
    output according to the ``RUN_TYPE`` declared in the task's input
    file and writes the labeled data in deepmd/npy format.
    """

    @classmethod
    def get_input_sign(cls):
        """Inputs: the fp ``config`` dict, the task name, and its directory."""
        return OPIOSign(
            {
                "config": BigParameter(dict),
                "task_name": BigParameter(str),
                "task_path": Artifact(Path),
            }
        )

    @classmethod
    def get_output_sign(cls):
        """Outputs: the CP2K log file and the labeled-data directory."""
        return OPIOSign(
            {
                "log": Artifact(Path),
                "labeled_data": Artifact(Path),
            }
        )

    @OP.exec_sign_check
    def execute(
        self,
        ip: OPIO,
    ) -> OPIO:
        run_config = ip["config"].get("run", {})
        op_in = OPIO(
            {
                "task_name": ip["task_name"],
                "task_path": ip["task_path"],
                "backward_list": [],
                "log_name": "output.log",
                "run_image_config": run_config,
            }
        )
        op = RunCp2k()
        op_out = op.execute(op_in)  # type: ignore in the case of not importing fpop
        # the task's working directory is the parent of the backward dir
        workdir = op_out["backward_dir"].parent

        file_path = os.path.join(str(workdir), "output.log")

        # convert the output to deepmd/npy format; the parser depends on
        # the RUN_TYPE declared in the task's CP2K input file
        # NOTE(review): assumes the input file is named "input.inp" —
        # confirm this matches what PrepCp2k writes
        with open(workdir / "input.inp", "r") as f:
            lines = f.readlines()

        # extract RUN_TYPE from the input file
        run_type = get_run_type(lines)

        if run_type == "ENERGY_FORCE":
            sys = dpdata.LabeledSystem(file_path, fmt="cp2kdata/e_f")
        elif run_type == "MD":
            sys = dpdata.LabeledSystem(
                str(workdir), cp2k_output_name="output.log", fmt="cp2kdata/md"
            )
        else:
            raise ValueError(f"Type of calculation {run_type} not supported")

        # the output dir name is fixed (not user-configurable)
        out_name = fp_default_out_data_name
        sys.to("deepmd/npy", workdir / out_name)

        return OPIO(
            {
                "log": workdir / "output.log",
                "labeled_data": workdir / out_name,
            }
        )

    @staticmethod
    def args():
        """Return the dargs schema for the run-time options."""
        doc_cmd = "The command of cp2k"
        return [
            Argument("command", str, optional=True, default="cp2k", doc=doc_cmd),
        ]
5 changes: 5 additions & 0 deletions dpgen2/op/run_dp_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ def get_input_sign(cls):
"init_data": Artifact(NestedDict[Path]),
"iter_data": Artifact(List[Path]),
"valid_data": Artifact(List[Path], optional=True),
"optional_files": Artifact(List[Path], optional=True),
}
)

Expand Down Expand Up @@ -328,6 +329,10 @@ def clean_before_quit():
with open(train_script_name, "w") as fp:
json.dump(train_dict, fp, indent=4)

if ip["optional_files"] is not None:
for f in ip["optional_files"]:
Path(f.name).symlink_to(f)

# train model
command = _make_train_command(
dp_command,
Expand Down
6 changes: 6 additions & 0 deletions dpgen2/superop/prep_run_dp_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
run_config: dict = normalize_step_dict({}),
upload_python_packages: Optional[List[os.PathLike]] = None,
valid_data: Optional[S3Artifact] = None,
optional_files: Optional[List[str]] = None,
):
self._input_parameters = {
"block_id": InputParameter(type=str, value=""),
Expand Down Expand Up @@ -119,6 +120,7 @@ def __init__(
run_config=run_config,
upload_python_packages=upload_python_packages,
valid_data=valid_data,
optional_files=optional_files,
)

@property
Expand Down Expand Up @@ -151,6 +153,7 @@ def _prep_run_dp_train(
run_config: dict = normalize_step_dict({}),
upload_python_packages: Optional[List[os.PathLike]] = None,
valid_data: Optional[S3Artifact] = None,
optional_files: Optional[List[str]] = None,
):
prep_config = deepcopy(prep_config)
run_config = deepcopy(run_config)
Expand Down Expand Up @@ -206,6 +209,9 @@ def _prep_run_dp_train(
"init_data": train_steps.inputs.artifacts["init_data"],
"iter_data": train_steps.inputs.artifacts["iter_data"],
"valid_data": valid_data,
"optional_files": upload_artifact(optional_files)
if optional_files is not None
else None,
},
with_sequence=argo_sequence(
argo_len(prep_train.outputs.parameters["task_names"]),
Expand Down
Loading

0 comments on commit fdefc70

Please sign in to comment.