Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable setting test size individually for each system #267

Merged
merged 7 commits into from
Oct 19, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ dist
.eggs
_version.py
venv*
.vscode/**
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,10 @@ Since we do not have virial data, the virial prefactors `start_pref_v` and `limi
An example of `training` is
```json
"training" : {
"systems": ["../data/"],
"systems": ["../data1/", "../data2/"],
"set_prefix": "set",
"stop_batch": 1000000,
"_comment": " batch_size can be supplied with, e.g. 1, or auto (string) or [10, 20]"
amcadmus marked this conversation as resolved.
Show resolved Hide resolved
"batch_size": 1,

"seed": 1,
Expand All @@ -409,6 +410,7 @@ An example of `training` is
"_comment": " frequencies counted in batch",
"disp_file": "lcurve.out",
"disp_freq": 100,
"_comment": " numb_test can be supplied with, e.g. 1, or XX% (string) or [10, 20]"
amcadmus marked this conversation as resolved.
Show resolved Hide resolved
"numb_test": 10,
"save_freq": 1000,
"save_ckpt": "model.ckpt",
Expand All @@ -422,9 +424,10 @@ An example of `training` is
```
The option **`systems`** provide location of the systems (path to `set.*` and `type.raw`). It is a vector, thus DeePMD-kit allows you to provide multiple systems. DeePMD-kit will train the model with the systems in the vector one by one in a cyclic manner. **It is warned that the example water data (in folder `examples/data/water`) is of very limited amount, is provided only for testing purpose, and should not be used to train a productive model.**

The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable a automatic batch size.
The option **`batch_size`** specifies the number of frames in each batch. It can be set to `"auto"` to enable an automatic batch size, or it can be input as a list setting the batch size individually for each system.
The option **`stop_batch`** specifies the total number of batches will be used in the training.

The option **`numb_test`** specifies the number of tests that will be used for each system. If it is an integer, each system will be tested with the same number of tests. It can be set to a percentage `"XX%"` to use XX% of the frames of each system for its testing, or it can be input as a list setting the number of tests individually for each system (the order should correspond to the ordering of the `systems` key in the json).

### Training

Expand Down
1 change: 1 addition & 0 deletions source/tests/test_deepmd_data_sys.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def test_get_test(self):
-
data['null']
), 0.0)

sys_idx = 2
data = ds.get_test(sys_idx=sys_idx)
self.assertEqual(list(data['type'][0]), list(np.sort(self.atom_type[sys_idx])))
Expand Down
66 changes: 55 additions & 11 deletions source/train/DataSystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,29 @@ def __init__ (self,
type_map_list.append(self.data_systems[ii].get_type_map())
self.type_map = self._check_type_map_consistency(type_map_list)

# ! altered by Marián Rynik
# test size
# now test size can be set as a percentage of systems data or test size
# can be set for each system individualy in the same manner as batch
# size. This enables one to use systems with diverse number of
# structures and different number of atoms.
self.test_size = test_size
if isinstance(self.test_size, int):
self.test_size = self.test_size * np.ones(self.nsystems, dtype=int)
elif isinstance(self.test_size, str):
words = self.test_size.split('%')
try:
percent = int(words[0])
except ValueError:
raise RuntimeError('unknown test_size rule ' + words[0])
self.test_size = self._make_auto_ts(percent)
elif isinstance(self.test_size, list):
pass
else :
raise RuntimeError('invalid test_size')
assert(isinstance(self.test_size, (list,np.ndarray)))
assert(len(self.test_size) == self.nsystems)

# prob of batch, init pick idx
self.prob_nbatches = [ float(i) for i in self.nbatches] / np.sum(self.nbatches)
self.pick_idx = 0
Expand All @@ -75,10 +98,10 @@ def __init__ (self,
if chk_ret is not None :
warnings.warn("system %s required batch size is larger than the size of the dataset %s (%d > %d)" % \
(self.system_dirs[ii], chk_ret[0], self.batch_size[ii], chk_ret[1]))
chk_ret = self.data_systems[ii].check_test_size(test_size)
chk_ret = self.data_systems[ii].check_test_size(self.test_size[ii])
if chk_ret is not None :
warnings.warn("system %s required test size is larger than the size of the dataset %s (%d > %d)" % \
(self.system_dirs[ii], chk_ret[0], test_size, chk_ret[1]))
(self.system_dirs[ii], chk_ret[0], self.test_size[ii], chk_ret[1]))


def _load_test(self, ntests = -1):
Expand Down Expand Up @@ -207,24 +230,34 @@ def get_batch (self,
b_data["default_mesh"] = self.default_mesh[self.pick_idx]
return b_data

# ! altered by Marián Rynik
def get_test(self,
             sys_idx=None,
             n_test=-1):
    """Return the test data of one system as a dict of arrays.

    Parameters
    ----------
    sys_idx : int or None
        Index of the system to fetch. When None, the system currently
        selected by ``self.pick_idx`` is used.
    n_test : int
        Number of test frames to load on the first call (forwarded to
        ``_load_test``); -1 loads the default amount. Ignored once the
        test data has been cached on ``self.test_data``.

    Returns
    -------
    dict
        Per-system test arrays plus the system's ``natoms_vec`` and
        ``default_mesh`` entries.
    """
    # lazily build the mesh and load the test frames only once;
    # subsequent calls reuse the cached attributes
    if not hasattr(self, 'default_mesh'):
        self._make_default_mesh()
    if not hasattr(self, 'test_data'):
        self._load_test(ntests=n_test)
    if sys_idx is not None:
        idx = sys_idx
    else:
        idx = self.pick_idx

    test_system_data = {}
    for nn in self.test_data:
        test_system_data[nn] = self.test_data[nn][idx]
    test_system_data["natoms_vec"] = self.natoms_vec[idx]
    test_system_data["default_mesh"] = self.default_mesh[idx]
    return test_system_data

def get_sys_ntest(self, sys_idx=None):
"""Get number of tests for the currently selected system,
or one defined by sys_idx."""
if sys_idx is not None :
return self.test_size[sys_idx]
else :
return self.test_size[self.pick_idx]

def get_type_map(self):
return self.type_map
Expand Down Expand Up @@ -261,20 +294,21 @@ def print_summary(self,
# width 65
sys_width = 42
tmp_msg += "---Summary of DataSystem------------------------------------------------\n"
tmp_msg += "find %d system(s):\n" % self.nsystems
tmp_msg += "found %d system(s):\n" % self.nsystems
tmp_msg += "%s " % self._format_name_length('system', sys_width)
tmp_msg += "%s %s %s %5s\n" % ('natoms', 'bch_sz', 'n_bch', 'prob')
tmp_msg += "%s %s %s %s %5s\n" % ('natoms', 'bch_sz', 'n_bch', "n_test", 'prob')
marian-code marked this conversation as resolved.
Show resolved Hide resolved
for ii in range(self.nsystems) :
tmp_msg += ("%s %6d %6d %5d %5.3f\n" %
tmp_msg += ("%s %6d %6d %6d %6d %5.3f\n" %
(self._format_name_length(self.system_dirs[ii], sys_width),
self.natoms[ii],
self.batch_size[ii],
self.nbatches[ii],
# TODO batch size * nbatches = number of structures
self.batch_size[ii],
self.nbatches[ii],
self.test_size[ii],
prob[ii]) )
tmp_msg += "------------------------------------------------------------------------\n"
run_opt.message(tmp_msg)


def _make_auto_bs(self, rule) :
bs = []
for ii in self.data_systems:
Expand All @@ -285,6 +319,16 @@ def _make_auto_bs(self, rule) :
bs.append(bsi)
return bs

# ! added by Marián Rynik
def _make_auto_ts(self, percent):
ts = []
for ii in range(self.nsystems):
ni = self.batch_size[ii] * self.nbatches[ii]
tsi = int(ni * percent / 100)
ts.append(tsi)

return ts

def _check_type_map_consistency(self, type_map_list):
ret = []
for ii in type_map_list:
Expand Down
23 changes: 15 additions & 8 deletions source/train/Trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,9 @@ def _init_param(self, jdata):
# training
training_param = j_must_have(jdata, 'training')

# ! first .add() altered by Marián Rynik
tr_args = ClassArg()\
.add('numb_test', int, default = 1)\
.add('numb_test', [int, list, str], default = 1)\
.add('disp_file', str, default = 'lcurve.out')\
.add('disp_freq', int, default = 100)\
.add('save_freq', int, default = 1000)\
Expand All @@ -182,7 +183,8 @@ def _init_param(self, jdata):
.add('sys_probs', list )\
.add('auto_prob_style', str, default = "prob_sys_size")
tr_data = tr_args.parse(training_param)
self.numb_test = tr_data['numb_test']
# not needed
# self.numb_test = tr_data['numb_test']
self.disp_file = tr_data['disp_file']
self.disp_freq = tr_data['disp_freq']
self.save_freq = tr_data['save_freq']
Expand Down Expand Up @@ -458,17 +460,24 @@ def test_on_the_fly (self,
fp,
data,
feed_dict_batch) :
test_data = data.get_test(ntests = self.numb_test)
# ! altered by Marián Rynik
# Do not need to pass numb_test here as data object already knows it.
# Both DeepmdDataSystem and ClassArg parse the same json file
test_data = data.get_test(n_test=data.get_sys_ntest())
feed_dict_test = {}
for kk in test_data.keys():
if kk == 'find_type' or kk == 'type' :
continue
if 'find_' in kk:
feed_dict_test[self.place_holders[kk]] = test_data[kk]
else:
feed_dict_test[self.place_holders[kk]] = np.reshape(test_data[kk][:self.numb_test], [-1])
# ! altered by Marián Rynik
# again the data object knows appropriate test data shape,
# there is no need to slice again!
# feed_dict_test[self.place_holders[kk]] = np.reshape(test_data[kk][:self.numb_test[data.pick_idx]], [-1])
feed_dict_test[self.place_holders[kk]] = np.reshape(test_data[kk], [-1])
for ii in ['type'] :
feed_dict_test[self.place_holders[ii]] = np.reshape(test_data[ii][:self.numb_test], [-1])
feed_dict_test[self.place_holders[ii]] = np.reshape(test_data[ii], [-1])
for ii in ['natoms_vec', 'default_mesh'] :
feed_dict_test[self.place_holders[ii]] = test_data[ii]
feed_dict_test[self.place_holders['is_training']] = False
Expand All @@ -483,6 +492,4 @@ def test_on_the_fly (self,
feed_dict_batch)
print_str += " %8.1e\n" % current_lr
fp.write(print_str)
fp.flush ()


fp.flush ()
18 changes: 12 additions & 6 deletions source/train/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# """
# cdf = 0.5 * (1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3)))))
# return x * cdf
def gelu(x):
    """Gaussian Error Linear Unit activation, evaluated by the custom
    compiled op (``op_module.gelu``) rather than the commented-out
    tanh approximation above."""
    return op_module.gelu(x)

data_requirement = {}
Expand Down Expand Up @@ -110,11 +110,17 @@ def add (self,
def _add_single(self, key, data) :
vtype = type(data)
if not(vtype in self.arg_dict[key]['types']) :
# try the type convertion to the first listed type
try :
vv = (self.arg_dict[key]['types'][0])(data)
except TypeError:
raise TypeError ("cannot convert provided key \"%s\" to type %s " % (key, str(self.arg_dict[key]['types'][0])) )
# ! altered by Marián Rynik
# try the type convertion to one of the types
for tp in self.arg_dict[key]['types']:
try :
vv = tp(data)
except TypeError:
pass
else:
break
else:
raise TypeError ("cannot convert provided key \"%s\" to type(s) %s " % (key, str(self.arg_dict[key]['types'])) )
else :
vv = data
self.arg_dict[key]['value'] = vv
Expand Down