Merge pull request #33 from iotcad/config

aucad · Mar 25, 2023 · c808566 · c808566
2 parents 943de18 + 214180a
commit c808566
Show file tree

Hide file tree

Showing 12 changed files with 188 additions and 82 deletions.
diff --git a/Makefile b/Makefile
@@ -29,8 +29,8 @@ T_ROBUST := --robust
 F_ROBUST :=
 
 ALWAYS := --resume
-IOT_OPTIONS := --validator IOT23
-NB_OPTIONS := --validator NB15
+IOT_OPTIONS := --validator IOT23 --config config/iot.yaml
+NB_OPTIONS := --validator NB15 --config config/unsw.yaml
 
 DS_1 := -d ./data/CTU.csv $(IOT_OPTIONS)
 DS_2 := -d ./data/nb15.csv $(NB_OPTIONS)
@@ -47,7 +47,7 @@ sample:
 	@$(foreach i, $(ITERS), $(foreach c, $(CLS), $(foreach r, $(ROBUST), \
 	$(foreach attack, $(ATTACKS), \
 	python3 -m src experiment $(ALWAYS) -a $(attack) $(DS_2) $($(r)) \
-		--iter 0 -s 50 -t 3  -c $(c) ; ))))
+		--iter 0 -s 50 -t 3 -c $(c) ; ))))
 
 fast:
 	@$(foreach r, $(ROBUST), $(foreach c, $(CLS), $(foreach attack, $(ATTACKS), \

diff --git a/config/default.yaml b/config/default.yaml
@@ -0,0 +1,54 @@
+hsj:
+  # Attack instance parameters
+  # https://adversarial-robustness-toolbox.readthedocs.io/en/latest/modules/attacks/evasion.html#hopskipjump-attack
+  # Size of the batch used by the estimator during inference.
+  batch_size: 64
+  # Maximum number of evaluations for estimating gradient.
+  max_eval: 1000
+  # Initial number of evaluations for estimating gradient.
+  init_eval: 100
+  # Maximum number of trials for initial generation of adversarial examples.
+  init_size: 100
+zoo: # Attack instance parameters
+  # https://adversarial-robustness-toolbox.readthedocs.io/en/latest/modules/attacks/evasion.html#zeroth-order-optimization-zoo-attack
+  # Confidence of adversarial examples: a higher value produces examples that are farther away from the original input,
+  # but classified with higher confidence as the target class.
+  confidence: 0.25
+  # The initial learning rate for the attack algorithm. Smaller values produce better results but are slower to
+  # converge.
+  learning_rate: 0.1
+  # Number of times to adjust constant with binary search (positive value).
+  binary_search_steps: 10
+  # The initial trade-off constant c to use to tune the relative importance of distance and confidence. If
+  # binary_search_steps is large, the initial constant is not important, as discussed in Carlini and Wagner (2016).
+  initial_const: 0.001
+  # True if gradient descent should be abandoned when it gets stuck.
+  abort_early: True
+  # True to use the resizing strategy from the paper: first, compute attack on inputs resized to 32x32, then increase
+  # size if needed to 64x64, followed by 128x128.
+  use_resize: False
+  # True to use importance sampling when choosing coordinates to update.
+  use_importance: False
+  # Step size for numerical estimation of derivatives.
+  variable_h: 0.3
+xgb: # Tree booster params
+  # <https://xgboost.readthedocs.io/en/stable/parameter.html#parameters-for-tree-booster>
+  # Step size shrinkage used in update to prevent overfitting.
+  eta: 0.3
+  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
+  gamma: 0
+  # Maximum depth of a tree
+  max_depth: 6
+  # Minimum sum of instance weight (hessian) needed in a child
+  min_child_weight: 1
+dnn:
+  # Keras model: https://keras.io/guides/sequential_model/
+  model:
+    # hidden layers
+    layers: [ 60 ]
+  # Keras model training args
+  model_fit:
+    # model training epochs
+    epochs: 80
+    # batch size
+    batch_size: 64
diff --git a/config/iot.yaml b/config/iot.yaml
@@ -0,0 +1,3 @@
+dnn:
+  model:
+    layers: [ 60 ]
diff --git a/config/unsw.yaml b/config/unsw.yaml
@@ -0,0 +1,3 @@
+dnn:
+  model:
+    layers: [ 60, 60, 60, 60, 60 ]
diff --git a/src/__main__.py b/src/__main__.py
@@ -16,8 +16,10 @@
 
 """
 import logging
+import yaml
 
 from argparse import ArgumentParser
+from pathlib import Path
 from sys import exit
 from typing import Optional, List
 
@@ -54,7 +56,12 @@ def main():
             args.validator, args.dataset, args.capture, args.out)
 
     if is_exp:
-        Experiment(utility.ts_str(), **args.__dict__).run()
+        df_args = yaml.safe_load(
+            Path(Experiment.DEFAULT_CF).read_text())
+        ex_args = yaml.safe_load(
+            Path(args.config).read_text()) if args.config else {}
+        c_args = {**df_args, **ex_args}
+        Experiment(utility.ts_str(), c_args, **args.__dict__).run()
 
 
 def init_logger(level: int, fn: str = None):
@@ -182,6 +189,12 @@ def exp_args(parser: ArgumentParser):
         action='store_true',
         help="disable console log"
     )
+    parser.add_argument(
+        '--config',
+        action="store",
+        default=None,
+        help=f'path to config file  [default: None]',
+    )
 
 
 def validator_args(parser: ArgumentParser):

diff --git a/src/attack.py b/src/attack.py
@@ -10,7 +10,8 @@
 class Attack:
     """Attack base class defines common functionality"""
 
-    def __init__(self, name, def_iter, validator, uuid, save, iters, silent):
+    def __init__(self, name, def_iter, validator, uuid, save, iters,
+                 silent, attack_conf):
         self.uuid = uuid
         self.name = name
         self.validator_kind = validator
@@ -26,6 +27,7 @@ def __init__(self, name, def_iter, validator, uuid, save, iters, silent):
         self.valid_result = None
         self.validation_reasons = None
         self.reset()
+        self.attack_conf = attack_conf or {}
 
     def reset(self):
         self.cls = None
@@ -96,7 +98,8 @@ def adv_proto_valid(self):
     def label_stats(self) -> dict:
         labels = []
         if self.use_validator and self.n_valid > 0:
-            labels = self.adv_y[self.idx_valid_evades].flatten().tolist()
+            labels = self.adv_y[
+                self.idx_valid_evades].flatten().tolist()
         elif self.n_evasions > 0:
             labels = self.adv_y[self.evasions].flatten().tolist()
         return dict([(self.cls.text_label(c), labels.count(c))
@@ -106,7 +109,8 @@ def get_proto_stats(self, records) -> dict:
         if not self.use_validator:
             return {}
         labels = [Validator.determine_proto(
-            self.validator_kind, self.cls.attrs, r).name for r in records]
+            self.validator_kind, self.cls.attrs, r).name for r in
+                  records]
         return dict(Counter(labels))
 
     def set_cls(self, cls: Classifier, indices=None):

diff --git a/src/classifier.py b/src/classifier.py
@@ -4,7 +4,8 @@
 class Classifier:
 
     def __init__(
-            self, name, out, attrs, y, robust, mask_cols, attr_ranges
+            self, name, out, attrs, y, robust, mask_cols, attr_ranges,
+            cls_conf
     ):
         self.name = name
         self.out_dir = out
@@ -26,6 +27,7 @@ def __init__(
         self.n_pred_pos = 0
         self.n_true_p = 0
         self.reset()
+        self.cls_conf = cls_conf or {}
 
     def reset(self):
         self.classifier = None

diff --git a/src/dnn.py b/src/dnn.py
@@ -32,13 +32,28 @@ class NeuralNetwork(Classifier):
 
     def __init__(self, *args):
         super().__init__('neural_network', *args)
-        self.epochs = 80
-        self.bsz = 64  # batch size
 
     @staticmethod
     def formatter(x, y):
         return x
 
+    def __c_key(self, key):
+        return self.cls_conf[key] if key in self.cls_conf else {}
+
+    def __model(self, key):
+        m = self.__c_key('model')
+        return m[key] if key in m else None
+
+    def __m_train(self, key=None):
+        m = self.__c_key('model_fit')
+        return m if not key else (m[key] if key in m else None)
+
+    def dnn_config(self):
+        lrs = self.__model('layers') or \
+              [60 for _ in range(max(1, len(self.mutable) // 4))]
+        bs = gcd(self.n_train, (self.__m_train('batch_size') or 64))
+        return lrs, {'epochs': 80, **self.__m_train(), 'batch_size': bs}
+
     def predict(self, data):
         tmp = self.model.predict(data)
         ax = 1 if len(tmp.shape) == 2 else 0
@@ -48,25 +63,20 @@ def _set_cls(self, cls):
         self.classifier = cls
         self.model = cls.model
 
-    @property
-    def model_fit_kwargs(self):
-        return {'callbacks': [EarlyStopping(monitor='loss', patience=5)],
-                'shuffle': True, 'verbose': False}
-
     def init_classifier(self):
         """Trains a deep neural network classifier."""
-        n_layers = max(1, len(self.mutable) // 4)
-        layers = [Dense(60, activation='relu') for _ in range(n_layers)] + \
+        layers, args = self.dnn_config()
+        layers = [Dense(v, activation='relu') for v in layers] + \
                  [Dense(self.n_classes, activation='softmax')]
         model = tf.keras.models.Sequential(layers)
         model.compile(
             optimizer=SGD(),
             loss=SparseCategoricalCrossentropy(),
             metrics=[SparseCategoricalAccuracy()])
         model.fit(
-            self.train_x, self.train_y, epochs=self.epochs,
-            batch_size=gcd(self.bsz, self.n_train),
-            **self.model_fit_kwargs)
+            self.train_x, self.train_y,
+            shuffle=True, verbose=False, **args,
+            callbacks=[EarlyStopping(monitor='loss', patience=5)])
         return KerasClassifier(model=model, clip_values=(0, 1))
 
     def init_robust(self):
@@ -77,7 +87,7 @@ def init_robust(self):
         trainer = AdversarialTrainer(
             # Model to train adversarially
             classifier=robust_classifier,
-            # Attacks to use for data augmentation in adversarial training
+            # Attacks to use for data augmentation
             attacks=attack,
             # Proportion of samples to be replaced with adversarial
             # counterparts. Value 1 trains only on adversarial samples.

diff --git a/src/experiment.py b/src/experiment.py
@@ -121,11 +121,12 @@ def valid_ratio(self) -> float:
 
     DEFAULT_DS = 'data/CTU.csv'
     DEFAULT_CLS = ClsLoader.XGB
+    DEFAULT_CF = 'config/default.yaml'
     ATTACKS = [AttackLoader.HSJ, AttackLoader.ZOO]
     CLASSIFIERS = [ClsLoader.XGB, ClsLoader.DNN]
     VALIDATORS = [Validator.NB15, Validator.IOT23]
 
-    def __init__(self, uuid, **kwargs):
+    def __init__(self, uuid, config_obj, **kwargs):
         self.uuid = uuid
         self.start_time = 0
         self.end_time = 0
@@ -138,8 +139,10 @@ def __init__(self, uuid, **kwargs):
         self.mask_cols = []
         self.attr_ranges = {}
         self.stats = Experiment.Result()
-        config_keys = ",".join(kwargs.keys())
-        self.config = namedtuple('exp', config_keys)(**kwargs)
+        config_keys = ",".join(
+            list(kwargs.keys()) + list(config_obj.keys()))
+        self.config = (namedtuple('exp', config_keys)
+                       (**kwargs, **config_obj))
 
     @property
     def n_records(self) -> int:
@@ -168,6 +171,10 @@ def is_repeat(self):
                 if f.startswith(match)]
         return prev[0] if len(prev) > 0 else None
 
+    def custom_config(self, key):
+        return getattr(self.config, key) \
+            if key and hasattr(self.config, key) else None
+
     def load_csv(self, ds_path: str, n_splits: int):
         self.attrs, rows = utility.read_dataset(ds_path)
         self.X = rows[:, :-1]
@@ -181,37 +188,17 @@ def load_csv(self, ds_path: str, n_splits: int):
             if set(col_values).issubset({0, 1}):
                 self.mask_cols.append(col_i)
 
-    def exec_fold(self, fold_num: int, fold_indices: List[int]):
-        self.cls.reset() \
-            .load(self.X.copy(), self.y.copy(), *fold_indices, fold_num) \
-            .train()
-        self.stats.append_cls(self.cls)
-        self.log_training_result(fold_num)
-        sample_size = 0 if self.config.sample_size < 1 else \
-            min(self.config.sample_size, self.cls.n_test)
-
-        if self.attack:
-            for n in range(self.config.sample_times):
-                sample_idx = None if sample_size < 1 else \
-                    sample(range(0, self.cls.n_test), sample_size)
-                self.attack.reset().set_cls(self.cls, sample_idx).run()
-                self.attack.eval_examples()
-                self.attack.validate()
-                if self.attack.save_records and self.attack.n_evasions > 0:
-                    self.attack.dump_result(self.config.out)
-                self.stats.append_attack(self.attack)
-                gc.collect()
-                self.log_fold_attack(n + 1, self.config.sample_times)
-
     def run(self):
         config, prev = self.config, self.is_repeat
         if config.resume and prev:
             return print('Saved result to', prev)
         self.load_csv(config.dataset, config.folds)
         cls_args = (config.cls, config.out, self.attrs, self.y,
-                    config.robust, self.mask_cols, self.attr_ranges)
+                    config.robust, self.mask_cols, self.attr_ranges,
+                    self.custom_config(config.cls))
         atk_args = (config.attack, config.validator, self.uuid,
-                    config.capture, config.iter, config.silent)
+                    config.capture, config.iter, config.silent,
+                    self.custom_config(config.attack))
         self.cls = ClsLoader.init(*cls_args)
         self.attack = AttackLoader.load(*atk_args) \
             if config.attack else None
@@ -231,6 +218,28 @@ def run(self):
         self.save_result()
         self.cleanup()
 
+    def exec_fold(self, fold_num: int, fold_indices: List[int]):
+        self.cls.reset() \
+            .load(self.X.copy(), self.y.copy(), *fold_indices, fold_num) \
+            .train()
+        self.stats.append_cls(self.cls)
+        self.log_training_result(fold_num)
+        sample_size = 0 if self.config.sample_size < 1 else \
+            min(self.config.sample_size, self.cls.n_test)
+
+        if self.attack:
+            for n in range(self.config.sample_times):
+                sample_idx = None if sample_size < 1 else \
+                    sample(range(0, self.cls.n_test), sample_size)
+                self.attack.reset().set_cls(self.cls, sample_idx).run()
+                self.attack.eval_examples()
+                self.attack.validate()
+                if self.attack.save_records and self.attack.n_evasions > 0:
+                    self.attack.dump_result(self.config.out)
+                self.stats.append_attack(self.attack)
+                gc.collect()
+                self.log_fold_attack(n + 1, self.config.sample_times)
+
     def log_experiment_setup(self):
         Show('Dataset', self.config.dataset)
         Show('Record count', self.n_records)
@@ -261,11 +270,14 @@ def log_fold_attack(self, sample_n: int, n_total: int):
         if n_total > 1:
             print('-' * 5)
         if n_total > 1:
-            Show('Sampling round', f'{sample_n}/{self.config.sample_times}')
+            Show('Sampling round',
+                 f'{sample_n}/{self.config.sample_times}')
         if self.attack:
-            Ratio('Evasions', self.attack.n_evasions, self.attack.n_records)
+            Ratio('Evasions', self.attack.n_evasions,
+                  self.attack.n_records)
             if self.attack.use_validator:
-                Ratio('Valid', self.attack.n_valid, self.attack.n_evasions)
+                Ratio('Valid', self.attack.n_valid,
+                      self.attack.n_evasions)
             if self.attack.has_evasions:
                 Show('Class labels',
                      utility.dump_num_dict(self.attack.label_stats))
@@ -280,9 +292,11 @@ def log_experiment_result(self):
         Show('Avg. Recall', f'{(self.stats.recall * 100):.2f} %')
         Show('Avg. F-score', f'{(self.stats.f_score * 100):.2f} %')
         if self.attack:
-            Ratio('Evasions', self.stats.n_evasions, self.stats.n_records)
+            Ratio('Evasions', self.stats.n_evasions,
+                  self.stats.n_records)
             if self.attack.use_validator:
-                Ratio('Valid', self.stats.n_valid, self.stats.n_evasions)
+                Ratio('Valid', self.stats.n_valid,
+                      self.stats.n_evasions)
         Show('Time', "{0} min {1:.2f} s".format(*self.duration))
 
     def to_dict(self) -> dict: