From 12e934288860220233bcf09780d183331593b6c1 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 12 May 2022 18:50:21 -0400 Subject: [PATCH 1/7] refactor simplify 1. Used `dp model-devi` to calculate model deviation, instead of local calculation. Supported by deepmodeling/deepmd-kit#1618, released in v2.1.1. So the version earlier than 2.1.1 is not supported any more. 2. Assumed all systems are MultiSystems. 3. Removed energy model deviation support --- dpgen/simplify/simplify.py | 348 ++++++++++--------------------------- 1 file changed, 88 insertions(+), 260 deletions(-) diff --git a/dpgen/simplify/simplify.py b/dpgen/simplify/simplify.py index 982db3114..f8307ffc2 100644 --- a/dpgen/simplify/simplify.py +++ b/dpgen/simplify/simplify.py @@ -9,6 +9,7 @@ 02: fp (optional, if the original dataset do not have fp data, same as generator) """ import logging +import warnings import queue import os import json @@ -111,20 +112,13 @@ def init_pick(iter_index, jdata, mdata): """pick up init data from dataset randomly""" pick_data = jdata['pick_data'] init_pick_number = jdata['init_pick_number'] - use_clusters = jdata.get('use_clusters', False) # use MultiSystems with System # TODO: support System and LabeledSystem # TODO: support other format - if use_clusters: - systems = get_multi_system(pick_data, jdata) - else: - systems = get_systems(pick_data, jdata) + systems = get_multi_system(pick_data, jdata) # label the system labels = [] - if use_clusters: - items = systems.systems.items() - else: - items = systems.items() + items = systems.systems.items() for key, system in items: labels.extend([(key, j) for j in range(len(system))]) @@ -146,48 +140,18 @@ def init_pick(iter_index, jdata, mdata): _init_dump_selected_frames(systems, labels, rest_idx, sys_data_path, jdata) -def _add_system(systems, key, system): - if key in systems.keys(): - systems[key].append(system) - else: - systems[key] = system - return systems - - def _init_dump_selected_frames(systems, labels, selc_idx, sys_data_path, jdata): - pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) - if use_clusters: - selc_systems = dpdata.MultiSystems() - for j in selc_idx: - sys_name, sys_id = labels[j] - selc_systems.append(systems[sys_name][sys_id]) - selc_systems.to_deepmd_raw(sys_data_path) - selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size) - else: - selc_systems = {} - for j in selc_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, systems[sys_name][sys_id]) - sys_idx_map = get_system_idx(pick_data) - for kk in selc_systems.keys(): - sub_path = os.path.join(sys_data_path, sys_name_fmt % sys_idx_map[kk]) - selc_systems[kk].to_deepmd_raw(sub_path) - selc_systems[kk].to_deepmd_npy(sub_path, set_size=selc_idx.size) - with open(os.path.join(sys_data_path, 'sys_idx_map.json'), 'w') as fp: - json.dump(sys_idx_map, fp, indent=4) - -def _dump_system_dict(systems, path): - for kk in systems: - sub_path = os.path.join(path, sys_name_fmt % (int(kk))) - systems[kk].to_deepmd_raw(sub_path) - systems[kk].to_deepmd_npy(sub_path, set_size=systems[kk].get_nframes()) + selc_systems = dpdata.MultiSystems() + for j in selc_idx: + sys_name, sys_id = labels[j] + selc_systems.append(systems[sys_name][sys_id]) + selc_systems.to_deepmd_raw(sys_data_path) + selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size) def make_model_devi(iter_index, jdata, mdata): """calculate the model deviation of the rest idx""" pick_data = jdata['pick_data'] - use_clusters = 
jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, model_devi_name) create_path(work_path) @@ -203,25 +167,7 @@ def make_model_devi(iter_index, jdata, mdata): rest_data_path = os.path.join(last_iter_name, model_devi_name, rest_data_name) if not os.path.exists(rest_data_path): return False - if use_clusters: - for jj, subsystem in enumerate(os.listdir(rest_data_path)): - task_name = "task." + model_devi_task_fmt % (0, jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - os.symlink(os.path.abspath(os.path.join(rest_data_path, subsystem)), - os.path.abspath(os.path.join(task_path, rest_data_name))) - else: - rest_data_path = os.path.abspath(rest_data_path) - sys_path = glob.glob(os.path.join(rest_data_path, sys_name_pattern)) - cwd = os.getcwd() - for ii in sys_path: - task_name = "task." + model_devi_task_fmt % (int(os.path.basename(ii).split('.')[1]), 0) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - os.chdir(task_path) - os.symlink(os.path.relpath(ii), rest_data_name) - os.chdir(cwd) - os.chdir(cwd) + os.symlink(os.path.abspath(rest_data_path), os.path.join(work_path, rest_data_name + ".old")) return True @@ -231,43 +177,28 @@ def run_model_devi(iter_index, jdata, mdata): work_path = os.path.join(iter_name, model_devi_name) # generate command commands = [] - tasks = glob.glob(os.path.join(work_path, "task.*")) - run_tasks = [os.path.basename(ii) for ii in tasks] + run_tasks = ["."] # get models models = glob.glob(os.path.join(work_path, "graph*pb")) model_names = [os.path.basename(ii) for ii in models] task_model_list = [] for ii in model_names: - task_model_list.append(os.path.join('..', ii)) - # get max data size - data_size = max([len(dpdata.System(os.path.join( - task, rest_data_name), fmt="deepmd/npy")) for task in tasks]) + task_model_list.append(os.path.join('.', ii)) # models commands = [] - detail_file_names = [] - for ii, mm in enumerate(task_model_list): - detail_file_name = "{prefix}-{ii}".format( - prefix=detail_file_name_prefix, - ii=ii, - ) - # TODO: support 0.x? 
- command = "{python} -m deepmd test -m {model} -s {system} -n {numb_test} -d {detail_file}".format( - python=mdata['python_test_path'], - model=mm, - system=rest_data_name, - numb_test=data_size, - detail_file=detail_file_name, - ) - commands.append(command) - detail_file_names.append(detail_file_name) + detail_file_name = detail_file_name_prefix + command = "{dp} model-devi -m {model} -s {system} -o {detail_file}".format( + dp=mdata.get('model_devi_command', 'dp'), + model=" ".join(task_model_list), + system=rest_data_name + ".old", + detail_file=detail_file_name, + ) + commands = [command] # submit - try: - model_devi_group_size = mdata['model_devi_group_size'] - except Exception: - model_devi_group_size = 1 + model_devi_group_size = mdata.get('model_devi_group_size', 1) - forward_files = [rest_data_name] - backward_files = sum([[pf+".e.out", pf+".f.out", pf+".v.out"] for pf in detail_file_names], []) + forward_files = [rest_data_name + ".old"] + backward_files = [detail_file_name] api_version = mdata.get('api_version', '0.9') if LooseVersion(api_version) < LooseVersion('1.0'): @@ -303,102 +234,50 @@ def run_model_devi(iter_index, jdata, mdata): def post_model_devi(iter_index, jdata, mdata): """calculate the model deviation""" - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, model_devi_name) - tasks = glob.glob(os.path.join(work_path, "task.*")) - tasks.sort() - - e_trust_lo = jdata['e_trust_lo'] - e_trust_hi = jdata['e_trust_hi'] - f_trust_lo = jdata['f_trust_lo'] - f_trust_hi = jdata['f_trust_hi'] - - if use_clusters: - sys_accurate = dpdata.MultiSystems() - sys_candinate = dpdata.MultiSystems() - sys_failed = dpdata.MultiSystems() - else: - sys_accurate = {} - sys_candinate = {} - sys_failed = {} - all_names = set() - - for task in tasks: - if not use_clusters: - sys_name = os.path.basename(task).split('.')[1] - all_names.add(sys_name) - # e.out - details_e = glob.glob(os.path.join(task, "{}-*.e.out".format(detail_file_name_prefix))) - e_all = np.array([np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e]) - e_std = np.std(e_all, axis=0) - n_frame = e_std.size - - # f.out - details_f = glob.glob(os.path.join(task, "{}-*.f.out".format(detail_file_name_prefix))) - f_all = np.array([np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3)) for detail_f in details_f]) - # (n_model, n_frame, n_atom, 3) - f_std = np.std(f_all, axis=0) - # (n_frame, n_atom, 3) - f_std = np.linalg.norm(f_std, axis=2) - # (n_frame, n_atom) - f_std = np.max(f_std, axis=1) - # (n_frame,) - - system_cls = get_system_cls(jdata) - for subsys, e_devi, f_devi in zip(system_cls(os.path.join(task, rest_data_name), fmt='deepmd/npy'), e_std, f_std): - if (e_devi < e_trust_hi and e_devi >= e_trust_lo) or (f_devi < f_trust_hi and f_devi >= f_trust_lo) : - if use_clusters: + + f_trust_lo = jdata['model_devi_f_trust_lo'] + f_trust_hi = jdata['model_devi_f_trust_hi'] + + sys_accurate = dpdata.MultiSystems() + sys_candinate = dpdata.MultiSystems() + sys_failed = dpdata.MultiSystems() + + sys_entire = dpdata.MultiSystems().from_deepmd_npy(os.path.join(work_path, rest_data_name + ".old")) + + detail_file_name = detail_file_name_prefix + with open(os.path.join(work_path, detail_file_name)) as f: + for line in f: + if line.startswith("# data.rest.old"): + name = (line.split()[1]).split("/")[-1] + elif line.startswith("#"): + pass + else: + idx = int(line.split()[0]) + f_devi = float(line.split()[4]) + subsys = sys_entire[name][idx] + if 
f_trust_lo <= f_devi < f_trust_hi: sys_candinate.append(subsys) - else: - sys_candinate = _add_system(sys_candinate, sys_name, subsys) - elif (e_devi >= e_trust_hi ) or (f_devi >= f_trust_hi ): - if use_clusters: + elif f_devi >= f_trust_hi: sys_failed.append(subsys) - else: - sys_failed = _add_system(sys_failed, sys_name, subsys) - elif (e_devi < e_trust_lo and f_devi < f_trust_lo ): - if use_clusters: + elif f_devi < f_trust_lo: sys_accurate.append(subsys) else: - sys_accurate = _add_system(sys_accurate, sys_name, subsys) - else: - raise RuntimeError('reach a place that should NOT be reached...') - if use_clusters: - counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()} - fp_sum = sum(counter.values()) - for cc_key, cc_value in counter.items(): - dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) - else: - all_names = list(all_names) - all_names.sort() - counter = {"candidate": 0, "accurate": 0, "failed": 0} - for kk in all_names: - sys_counter = {"candidate": 0, "accurate": 0, "failed": 0} - if kk in sys_candinate.keys(): - sys_counter['candidate'] += sys_candinate[kk].get_nframes() - if kk in sys_accurate.keys(): - sys_counter['accurate'] += sys_accurate[kk].get_nframes() - if kk in sys_failed.keys(): - sys_counter['failed'] += sys_failed[kk].get_nframes() - fp_sum = sum(sys_counter.values()) - for cc_key, cc_value in sys_counter.items(): - if fp_sum != 0: - dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) - else: - dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, 0*100)) - for ii in ['candidate', 'accurate', 'failed']: - counter[ii] += sys_counter[ii] + raise RuntimeError('reach a place that should NOT be reached...') + + counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()} + fp_sum = sum(counter.values()) + for cc_key, cc_value in counter.items(): + dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100)) if counter['candidate'] == 0 and counter['failed'] > 0: raise RuntimeError('no candidate but still have failed cases, stop. 
You may want to refine the training or to increase the trust level hi') # label the candidate system labels = [] - if use_clusters: - items = sys_candinate.systems.items() - else: - items = sys_candinate.items() + items = sys_candinate.systems.items() + for key, system in items: labels.extend([(key, j) for j in range(len(system))]) # candinate: pick up randomly @@ -412,112 +291,61 @@ def post_model_devi(iter_index, jdata, mdata): (counter['candidate'], len(pick_idx), float(len(pick_idx))/counter['candidate']*100., len(rest_idx), float(len(rest_idx))/counter['candidate']*100.)) # dump the picked candinate data - if use_clusters: - picked_systems = dpdata.MultiSystems() - for j in pick_idx: - sys_name, sys_id = labels[j] - picked_systems.append(sys_candinate[sys_name][sys_id]) - sys_data_path = os.path.join(work_path, picked_data_name) - picked_systems.to_deepmd_raw(sys_data_path) - picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number) - else: - selc_systems = {} - for j in pick_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id]) - sys_data_path = os.path.join(work_path, picked_data_name) - _dump_system_dict(selc_systems, sys_data_path) + picked_systems = dpdata.MultiSystems() + for j in pick_idx: + sys_name, sys_id = labels[j] + picked_systems.append(sys_candinate[sys_name][sys_id]) + sys_data_path = os.path.join(work_path, picked_data_name) + picked_systems.to_deepmd_raw(sys_data_path) + picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number) + # dump the rest data (not picked candinate data and failed data) - if use_clusters: - rest_systems = dpdata.MultiSystems() - for j in rest_idx: - sys_name, sys_id = labels[j] - rest_systems.append(sys_candinate[sys_name][sys_id]) - rest_systems += sys_failed - sys_data_path = os.path.join(work_path, rest_data_name) - rest_systems.to_deepmd_raw(sys_data_path) + rest_systems = dpdata.MultiSystems() + for j in rest_idx: + sys_name, sys_id = labels[j] + rest_systems.append(sys_candinate[sys_name][sys_id]) + rest_systems += sys_failed + sys_data_path = os.path.join(work_path, rest_data_name) + rest_systems.to_deepmd_raw(sys_data_path) + if rest_idx.size: rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size) - else: - selc_systems = {} - for j in rest_idx: - sys_name, sys_id = labels[j] - selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id]) - for kk in sys_failed.keys(): - selc_systems = _add_system(selc_systems, kk, sys_failed[kk]) - sys_data_path = os.path.join(work_path, rest_data_name) - _dump_system_dict(selc_systems, sys_data_path) + # dump the accurate data -- to another directory - if use_clusters: - sys_data_path = os.path.join(work_path, accurate_data_name) - sys_accurate.to_deepmd_raw(sys_data_path) - sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes()) - else: - sys_data_path = os.path.join(work_path, accurate_data_name) - _dump_system_dict(sys_accurate, sys_data_path) + sys_data_path = os.path.join(work_path, accurate_data_name) + sys_accurate.to_deepmd_raw(sys_data_path) + sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes()) def make_fp_labeled(iter_index, jdata): dlog.info("already labeled, skip make_fp and link data directly") pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, fp_name) create_path(work_path) picked_data_path = 
os.path.join(iter_name, model_devi_name, picked_data_name) - if use_clusters: - os.symlink(os.path.abspath(picked_data_path), os.path.abspath( - os.path.join(work_path, "task." + data_system_fmt % 0))) - os.symlink(os.path.abspath(picked_data_path), os.path.abspath( - os.path.join(work_path, "data." + data_system_fmt % 0))) - else: - picked_data_path = os.path.abspath(picked_data_path) - sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern)) - cwd = os.getcwd() - os.chdir(work_path) - for ii in sys_path: - sys_idx = os.path.basename(ii).split('.')[1] - data_dir = 'data.' + data_system_fmt % int(sys_idx) - task_dir = 'task.' + data_system_fmt % int(sys_idx) - os.symlink(os.path.relpath(ii), data_dir) - os.symlink(os.path.relpath(ii), task_dir) - os.chdir(cwd) + os.symlink(os.path.abspath(picked_data_path), os.path.abspath( + os.path.join(work_path, "task." + data_system_fmt % 0))) + os.symlink(os.path.abspath(picked_data_path), os.path.abspath( + os.path.join(work_path, "data." + data_system_fmt % 0))) def make_fp_configs(iter_index, jdata): pick_data = jdata['pick_data'] - use_clusters = jdata.get('use_clusters', False) iter_name = make_iter_name(iter_index) work_path = os.path.join(iter_name, fp_name) create_path(work_path) picked_data_path = os.path.join(iter_name, model_devi_name, picked_data_name) - if use_clusters: - systems = get_multi_system(picked_data_path, jdata) - jj = 0 - for system in systems: - for subsys in system: - task_name = "task." + fp_task_fmt % (0, jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) - jj += 1 - else: - picked_data_path = os.path.abspath(picked_data_path) - sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern)) - for ii in sys_path: - tmp_sys = dpdata.System(ii, fmt = 'deepmd/npy') - sys_idx = os.path.basename(ii).split('.')[1] - jj = 0 - for ss in tmp_sys: - task_name = "task." + fp_task_fmt % (int(sys_idx), jj) - task_path = os.path.join(work_path, task_name) - create_path(task_path) - ss.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) - job = {} - with open(os.path.join(task_path, 'job.json'), 'w') as fp: - json.dump(job, fp, indent=4) - jj += 1 + systems = get_multi_system(picked_data_path, jdata) + jj = 0 + for system in systems: + for subsys in system: + task_name = "task." 
+ fp_task_fmt % (0, jj) + task_path = os.path.join(work_path, task_name) + create_path(task_path) + subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR')) + jj += 1 def make_fp_gaussian(iter_index, jdata): From 0ccf26c5e1eaec8f2748daf4a9102bc8546faa23 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 13 May 2022 03:16:50 -0400 Subject: [PATCH 2/7] expand path when getting multisystems --- dpgen/simplify/simplify.py | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/dpgen/simplify/simplify.py b/dpgen/simplify/simplify.py index f8307ffc2..85b488337 100644 --- a/dpgen/simplify/simplify.py +++ b/dpgen/simplify/simplify.py @@ -44,10 +44,6 @@ def expand_sys_str(root_dir): for root, dirnames, filenames in os.walk(root_dir, followlinks=True): for filename in fnmatch.filter(filenames, 'type.raw'): matches.append(root) - matches.sort() - dirnames = [os.path.basename(ii) for ii in matches] - if (len(list(set(dirnames))) != len(matches)) : - raise RuntimeError('duplicated system name: it is highly recommend to place all systems in the same level of directory and has different names') return matches @@ -59,28 +55,12 @@ def get_system_cls(jdata): def get_multi_system(path, jdata): system = get_system_cls(jdata) + system_paths = expand_sys_str(path) systems = dpdata.MultiSystems( - *[system(os.path.join(path, s), fmt='deepmd/npy') for s in os.listdir(path)]) + *[system(s, fmt='deepmd/npy') for s in system_paths]) return systems -def get_systems(path, jdata): - system_cls = get_system_cls(jdata) - system_paths = expand_sys_str(path) - systems = {} - for ii in system_paths: - systems[os.path.basename(ii)] = system_cls(ii, fmt='deepmd/npy') - return systems - - -def get_system_idx(path): - system_paths = expand_sys_str(path) - sys_idx_map = {} - for idx,ii in enumerate(system_paths): - sys_idx_map[os.path.basename(ii)] = idx - return sys_idx_map - - def init_model(iter_index, jdata, mdata): training_init_model = jdata.get('training_init_model', False) if not training_init_model: From b545659d114d63ba68a16dd8b2928c2ae6f2f060 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 13 May 2022 03:50:15 -0400 Subject: [PATCH 3/7] let `make_train` and `run_train` expand paths --- README.md | 1 - dpgen/generator/run.py | 72 ++++++++++++++------------------------ dpgen/simplify/simplify.py | 9 +---- dpgen/util.py | 22 ++++++++++++ 4 files changed, 50 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index b59725ec9..7d57b5961 100644 --- a/README.md +++ b/README.md @@ -501,7 +501,6 @@ The bold notation of key (such aas **type_map**) means that it's a necessary key | init_data_prefix | String | "/sharedext4/.../data/" | Prefix of initial data directories | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here. | ***sys_format*** | String | "vasp/poscar" | Format of initial data. It will be `vasp/poscar` if not set. - | init_multi_systems | Boolean | false | If set to `true`, `init_data_sys` directories should contain sub-directories of various systems. DP-GEN will regard all of these sub-directories as inital data systems. | init_batch_size | String of integer | [8] | Each number is the batch_size of corresponding system for training in `init_data_sys`. One recommended rule for setting the `sys_batch_size` and `init_batch_size` is that `batch_size` mutiply number of atoms ot the stucture should be larger than 32. 
If set to `auto`, batch size will be 32 divided by number of atoms. |
 | sys_configs_prefix | String | "/sharedext4/.../data/" | Prefix of `sys_configs`
 | **sys_configs** | List of list of string | [["/sharedext4/.../POSCAR"], ["....../POSCAR"]
] | Containing directories of structures to be explored in iterations.Wildcard characters are supported here. | diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py index dc7a91d3b..033cfb99d 100644 --- a/dpgen/generator/run.py +++ b/dpgen/generator/run.py @@ -60,7 +60,7 @@ from dpgen.generator.lib.ele_temp import NBandsEsti from dpgen.remote.decide_machine import convert_mdata from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission -from dpgen.util import sepline +from dpgen.util import sepline, expand_sys_str from dpgen import ROOT_PATH from pymatgen.io.vasp import Incar,Kpoints,Potcar from dpgen.auto_test.lib.vasp import make_kspacing_kpoints @@ -287,13 +287,10 @@ def make_train (iter_index, # make sure all init_data_sys has the batch size -- for the following `zip` assert (len(init_data_sys_) <= len(init_batch_size_)) for ii, ss in zip(init_data_sys_, init_batch_size_) : - if jdata.get('init_multi_systems', False): - for single_sys in os.listdir(os.path.join(work_path, 'data.init', ii)): - init_data_sys.append(os.path.join('..', 'data.init', ii, single_sys)) - init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii, single_sys))) - else: - init_data_sys.append(os.path.join('..', 'data.init', ii)) - init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii))) + sys_paths = expand_sys_str(ii) + for single_sys in sys_paths: + init_data_sys.append(os.path.join('..', 'data.init', os.path.relpath(single_sys, ii))) + init_batch_size.append(detect_batch_size(ss, single_sys)) old_range = None if iter_index > 0 : for ii in range(iter_index) : @@ -307,25 +304,18 @@ def make_train (iter_index, sys_batch_size = ["auto" for aa in range(len(sys_list))] for jj in fp_data_sys : sys_idx = int(jj.split('.')[-1]) - if jdata.get('use_clusters', False): - nframes = 0 - for sys_single in os.listdir(jj): - tmp_box = np.loadtxt(os.path.join(jj, sys_single, 'box.raw')) - tmp_box = np.reshape(tmp_box, [-1,9]) - nframes += tmp_box.shape[0] - if nframes < fp_task_min : - log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) - continue - for sys_single in os.listdir(jj): - init_data_sys.append(os.path.join('..', 'data.iters', jj, sys_single)) - init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], os.path.join(jj, sys_single))) - else: - nframes = dpdata.System(jj, 'deepmd/npy').get_nframes() - if nframes < fp_task_min : - log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) - continue - init_data_sys.append(os.path.join('..', 'data.iters', jj)) - init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], jj)) + sys_paths = expand_sys_str(jj) + nframes = 0 + for sys_single in sys_paths: + tmp_box = np.loadtxt(os.path.join(sys_single, 'box.raw')) + tmp_box = np.reshape(tmp_box, [-1,9]) + nframes += tmp_box.shape[0] + if nframes < fp_task_min : + log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) + continue + for sys_single in sys_paths: + init_data_sys.append(os.path.join('..', 'data.iters', sys_single)) + init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], sys_single)) # establish tasks jinput = jdata['default_training_param'] try: @@ -567,25 +557,17 @@ def run_train (iter_index, os.chdir(work_path) fp_data = glob.glob(os.path.join('data.iters', 'iter.*', '02.fp', 'data.*')) for ii in init_data_sys : - if jdata.get('init_multi_systems', False): - for single_sys in os.listdir(os.path.join(ii)): - trans_comm_data 
+= glob.glob(os.path.join(ii, single_sys, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc')) - else: - trans_comm_data += glob.glob(os.path.join(ii, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, 'nopbc')) + sys_paths = expand_sys_str(ii) + for single_sys in sys_paths: + trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc')) for ii in fp_data : - if jdata.get('use_clusters', False): - for single_sys in os.listdir(os.path.join(ii)): - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc')) - else: - trans_comm_data += glob.glob(os.path.join(ii, 'set.*')) - trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw')) - trans_comm_data += glob.glob(os.path.join(ii, 'nopbc')) + sys_paths = expand_sys_str(ii) + for single_sys in sys_paths: + trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw')) + trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc')) os.chdir(cwd) try: diff --git a/dpgen/simplify/simplify.py b/dpgen/simplify/simplify.py index 85b488337..529401519 100644 --- a/dpgen/simplify/simplify.py +++ b/dpgen/simplify/simplify.py @@ -22,7 +22,7 @@ from dpgen import dlog from dpgen import SHORT_CMD -from dpgen.util import sepline +from dpgen.util import sepline, expand_sys_str from distutils.version import LooseVersion from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission from dpgen.generator.run import make_train, run_train, post_train, run_fp, post_fp, fp_name, model_devi_name, train_name, train_task_fmt, sys_link_fp_vasp_pp, make_fp_vasp_incar, make_fp_vasp_kp, make_fp_vasp_cp_cvasp, data_system_fmt, model_devi_task_fmt, fp_task_fmt @@ -39,13 +39,6 @@ sys_name_fmt = 'sys.' + data_system_fmt sys_name_pattern = 'sys.[0-9]*[0-9]' -def expand_sys_str(root_dir): - matches = [] - for root, dirnames, filenames in os.walk(root_dir, followlinks=True): - for filename in fnmatch.filter(filenames, 'type.raw'): - matches.append(root) - return matches - def get_system_cls(jdata): if jdata.get("labeled", False): diff --git a/dpgen/util.py b/dpgen/util.py index aa805e7e5..9491cdc30 100644 --- a/dpgen/util.py +++ b/dpgen/util.py @@ -1,5 +1,7 @@ #!/usr/bin/env python # coding: utf-8 +from typing import Union, List +from pathlib import Path from dpgen import dlog @@ -25,3 +27,23 @@ def box_center(ch='',fill=' ',sp="|"): ''' strs=ch.center(Len,fill) dlog.info(sp+strs[1:len(strs)-1:]+sp) + + +def expand_sys_str(root_dir: Union[str, Path]) -> List[str]: + """Recursively iterate over directories taking those that contain `type.raw` file. 
+ + Parameters + ---------- + root_dir : Union[str, Path] + starting directory + + Returns + ------- + List[str] + list of string pointing to system directories + """ + root_dir = Path(root_dir) + matches = [str(d) for d in root_dir.rglob("*") if (d / "type.raw").is_file()] + if (root_dir / "type.raw").is_file(): + matches.append(str(root_dir)) + return matches From 8e3c68cb7e4ac3cf95241f5766a145559b48ba0a Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 13 May 2022 03:56:02 -0400 Subject: [PATCH 4/7] load numpy array instead --- dpgen/generator/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py index 033cfb99d..8018dc34f 100644 --- a/dpgen/generator/run.py +++ b/dpgen/generator/run.py @@ -307,7 +307,7 @@ def make_train (iter_index, sys_paths = expand_sys_str(jj) nframes = 0 for sys_single in sys_paths: - tmp_box = np.loadtxt(os.path.join(sys_single, 'box.raw')) + tmp_box = np.load(os.path.join(sys_single, 'box.npy')) tmp_box = np.reshape(tmp_box, [-1,9]) nframes += tmp_box.shape[0] if nframes < fp_task_min : From dbac9af763eac21f14ada4be1bf3eae5130de9d0 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 13 May 2022 03:57:27 -0400 Subject: [PATCH 5/7] use dpdata to get nframes --- dpgen/generator/run.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py index 8018dc34f..becdafa34 100644 --- a/dpgen/generator/run.py +++ b/dpgen/generator/run.py @@ -307,9 +307,7 @@ def make_train (iter_index, sys_paths = expand_sys_str(jj) nframes = 0 for sys_single in sys_paths: - tmp_box = np.load(os.path.join(sys_single, 'box.npy')) - tmp_box = np.reshape(tmp_box, [-1,9]) - nframes += tmp_box.shape[0] + nframes += dpdata.LabeledSystem(sys_single, fmt="deepmd/npy").get_nframes() if nframes < fp_task_min : log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) continue From 05d1ac5354c888bff9cba73439fe129c7f25b10c Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 13 May 2022 04:16:37 -0400 Subject: [PATCH 6/7] fix tests --- dpgen/generator/run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py index becdafa34..41b5aa299 100644 --- a/dpgen/generator/run.py +++ b/dpgen/generator/run.py @@ -287,9 +287,9 @@ def make_train (iter_index, # make sure all init_data_sys has the batch size -- for the following `zip` assert (len(init_data_sys_) <= len(init_batch_size_)) for ii, ss in zip(init_data_sys_, init_batch_size_) : - sys_paths = expand_sys_str(ii) + sys_paths = expand_sys_str(os.path.join(init_data_prefix, ii)) for single_sys in sys_paths: - init_data_sys.append(os.path.join('..', 'data.init', os.path.relpath(single_sys, ii))) + init_data_sys.append(os.path.normpath(os.path.join('..', 'data.init', ii, os.path.relpath(single_sys, os.path.join(init_data_prefix, ii))))) init_batch_size.append(detect_batch_size(ss, single_sys)) old_range = None if iter_index > 0 : @@ -312,7 +312,7 @@ def make_train (iter_index, log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj)) continue for sys_single in sys_paths: - init_data_sys.append(os.path.join('..', 'data.iters', sys_single)) + init_data_sys.append(os.path.normpath(os.path.join('..', 'data.iters', sys_single))) init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], sys_single)) # establish tasks jinput = jdata['default_training_param'] From 301821cd3fbd6eb653b259accda50eda5b83ad80 Mon Sep 17 00:00:00 2001 
From: Jinzhe Zeng
Date: Mon, 30 May 2022 19:45:26 -0400
Subject: [PATCH 7/7] update README

---
 README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7d57b5961..c833ed059 100644
--- a/README.md
+++ b/README.md
@@ -499,7 +499,7 @@ The bold notation of key (such aas **type_map**) means that it's a necessary key
 | **use_ele_temp** | int | 0 | Currently only support fp_style vasp. 0(default): no electron temperature. 1: eletron temperature as frame parameter. 2: electron temperature as atom parameter.
 | *#Data*
 | init_data_prefix | String | "/sharedext4/.../data/" | Prefix of initial data directories
- | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here.
+ | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here. Systems will be detected recursively in the directories.
 | ***sys_format*** | String | "vasp/poscar" | Format of initial data. It will be `vasp/poscar` if not set.
 | init_batch_size | String of integer | [8] | Each number is the batch_size of corresponding system for training in `init_data_sys`. One recommended rule for setting the `sys_batch_size` and `init_batch_size` is that `batch_size` mutiply number of atoms ot the stucture should be larger than 32. If set to `auto`, batch size will be 32 divided by number of atoms. |
 | sys_configs_prefix | String | "/sharedext4/.../data/" | Prefix of `sys_configs`
@@ -1085,7 +1085,6 @@ Here is an example of `param.json` for QM7 dataset:
     },
     "_comment": "that's all"
   },
-  "use_clusters": true,
   "fp_style": "gaussian",
   "shuffle_poscar": false,
   "fp_task_max": 1000,
@@ -1108,7 +1107,7 @@ Here is an example of `param.json` for QM7 dataset:
 }
 ```
 
-Here `pick_data` is the data to simplify and currently only supports `MultiSystems` containing `System` with `deepmd/npy` format, and `use_clusters` should always be `true`. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `e_trust_lo`, `e_trust_hi` mean the range of the deviation of the frame energy, and `f_trust_lo` and `f_trust_hi` mean the range of the max deviation of atomic forces in a frame. `fp_style` can only be `gaussian` currently. Other parameters are as the same as those of generator.
+Here `pick_data` is the directory containing the data to simplify; systems in `deepmd/npy` format are detected recursively under it. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `model_devi_f_trust_lo` and `model_devi_f_trust_hi` set the range of the max deviation of atomic forces in a frame; energy model deviation is no longer supported. `fp_style` can only be `gaussian` currently. Other parameters are the same as those of the generator.
 
 ## Set up machine
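
For reference, the recursive system discovery these patches converge on: `expand_sys_str` (moved into `dpgen/util.py` by PATCH 3/7) treats every directory that contains a `type.raw` file, including the root itself, as a system. Below is a minimal usage sketch mirroring `get_multi_system` in `dpgen/simplify/simplify.py`; the `data/` layout is hypothetical, and `dpdata.LabeledSystem` stands in for whatever class `get_system_cls` selects.

```python
import dpdata
from dpgen.util import expand_sys_str

# hypothetical deepmd/npy layout -- any nesting depth works, since a
# system is identified by the presence of a type.raw file:
#   data/mol.000/type.raw        data/mol.000/set.000/...
#   data/bulk/mol.001/type.raw   data/bulk/mol.001/set.000/...
system_paths = expand_sys_str("data")
# e.g. ["data/mol.000", "data/bulk/mol.001"] (order is not guaranteed)

# collect all detected systems into one MultiSystems, as get_multi_system does
systems = dpdata.MultiSystems(
    *[dpdata.LabeledSystem(p, fmt="deepmd/npy") for p in system_paths]
)
print(systems.get_nframes())
```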
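
The selection step that PATCH 1/7 delegates to `dp model-devi` (deepmd-kit >= 2.1.1) can then be read as the standalone sketch below. This is an illustration, not the shipped code: `split_by_force_deviation` is a hypothetical helper, and `model_devi.out` / `data.rest.old` in the usage comment stand in for `detail_file_name_prefix` and `rest_data_name + ".old"` in the patch. The parsing follows `post_model_devi` above: a `# data.rest.old/...` header line names the current system, column 0 of a data line is the frame index, and column 4 is the max deviation of atomic forces.

```python
import dpdata

def split_by_force_deviation(detail_file, systems, f_trust_lo, f_trust_hi):
    """Bin every frame into accurate/candidate/failed MultiSystems,
    reading the detail file written by `dp model-devi`."""
    accurate = dpdata.MultiSystems()
    candidate = dpdata.MultiSystems()
    failed = dpdata.MultiSystems()
    name = None
    with open(detail_file) as f:
        for line in f:
            tokens = line.split()
            if line.startswith("# data.rest.old"):
                # per-system header; keep the last path component as the key
                name = tokens[1].split("/")[-1]
            elif line.startswith("#"):
                continue  # column-header line, skip
            else:
                idx = int(tokens[0])       # frame index within system `name`
                f_devi = float(tokens[4])  # max force deviation of the frame
                frame = systems[name][idx]
                if f_devi >= f_trust_hi:
                    failed.append(frame)
                elif f_devi >= f_trust_lo:
                    candidate.append(frame)
                else:
                    accurate.append(frame)
    return accurate, candidate, failed

# usage mirroring post_model_devi (threshold values are placeholders for
# model_devi_f_trust_lo / model_devi_f_trust_hi from param.json):
# systems = dpdata.MultiSystems().from_deepmd_npy("data.rest.old")
# acc, cand, fail = split_by_force_deviation("model_devi.out", systems,
#                                            f_trust_lo=0.15, f_trust_hi=0.35)
```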