Merge pull request #10 from Microsoft/master
merge master
SparkSnail authored Sep 25, 2018
2 parents f6b7c0a + 781613d commit a74febc
Showing 8 changed files with 266 additions and 17 deletions.
15 changes: 15 additions & 0 deletions docs/NNICTLDOC.md
@@ -234,4 +234,19 @@ nnictl log
| --head, -h| False| |show head lines of stderr|
| --tail, -t| False| |show tail lines of stderr|
| --path, -p| False| |show the path of stderr file|

* __nnictl log trial__
  * Description

    Show trial log path.

  * Usage

    nnictl log trial [options]

    Options:

    | Name, shorthand | Required|Default | Description |
    | ------ | ------ | ------ |------ |
    | --id, -I| False| |the id of the trial|
208 changes: 205 additions & 3 deletions examples/trials/ga_squad/README.md
@@ -20,7 +20,9 @@ Also we have another version with lower time cost and better performance.

# How to run this example?

## Use downloading script to download data
## Download data

### Use the download script

Execute the following command to download the needed files
using the download script:
@@ -30,7 +32,7 @@
chmod +x ./download.sh
./download.sh
```

## Download manually
### Download manually

1. Download "dev-v1.1.json" and "train-v1.1.json" from https://rajpurkar.github.io/SQuAD-explorer/

@@ -46,7 +48,207 @@
wget http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip glove.840B.300d.zip
```

# submit this job
## Update configuration
Modify `nni/examples/trials/ga_squad/config.yaml`. Here is the default configuration:

```
authorName: default
experimentName: example_ga_squad
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
#choice: local, remote
trainingServicePlatform: local
#choice: true, false
useAnnotation: false
tuner:
  codeDir: ~/nni/examples/tuners/ga_customer_tuner
  classFileName: customer_tuner.py
  className: CustomerTuner
  classArgs:
    optimize_mode: maximize
trial:
  command: python3 trial.py
  codeDir: ~/nni/examples/trials/ga_squad
  gpuNum: 0
```

In the "trial" part, if you want to use GPU to perform the architecture search, change `gpuNum` from `0` to `1`. You need to increase the `maxTrialNum` and `maxExecDuration`, according to how long you want to wait for the search result.

`trialConcurrency` is the number of trials that run concurrently; if you set `gpuNum` to `1`, it should match the number of GPUs you want to use.
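
If you prefer to script these changes instead of editing the file by hand, here is a minimal sketch (assuming PyYAML is installed; the values shown are only illustrative, not recommended settings):

```
import yaml  # PyYAML, assumed to be installed

with open('config.yaml') as f:
    cfg = yaml.safe_load(f)

cfg['trial']['gpuNum'] = 1       # use one GPU per trial
cfg['trialConcurrency'] = 2      # match the number of available GPUs
cfg['maxTrialNum'] = 100         # evaluate more candidate architectures
cfg['maxExecDuration'] = '24h'   # allow a longer search

with open('config.yaml', 'w') as f:
    yaml.dump(cfg, f, default_flow_style=False)
```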

## Submit this job

```
nnictl create --config ~/nni/examples/trials/ga_squad/config.yaml
```

# Technical details about the trial

## How does it work

The evolution-algorithm-based architecture search for question answering has two parts, just like any other example: the trial and the tuner.

### The trial

The trial has many different files, functions, and classes. Here we only give a brief introduction to the most important files:

* `attention.py` contains an implementation of the attention mechanism in TensorFlow.
* `data.py` contains functions for data preprocessing.
* `evaluate.py` contains the evaluation script.
* `graph.py` contains the definition of the computation graph.
* `rnn.py` contains an implementation of GRU in TensorFlow.
* `train_model.py` is a wrapper for the whole question answering model.

Among those files, `trial.py` and `graph_to_tf.py` are special.

`graph_to_tf.py` has a function named `graph_to_network`; here is its skeleton code:

```
def graph_to_network(input1,
                     input2,
                     input1_lengths,
                     input2_lengths,
                     graph,
                     dropout_rate,
                     is_training,
                     num_heads=1,
                     rnn_units=256):
    topology = graph.is_topology()
    layers = dict()
    layers_sequence_lengths = dict()
    num_units = input1.get_shape().as_list()[-1]
    layers[0] = input1*tf.sqrt(tf.cast(num_units, tf.float32)) + \
        positional_encoding(input1, scale=False, zero_pad=False)
    layers[1] = input2*tf.sqrt(tf.cast(num_units, tf.float32))
    layers[0] = dropout(layers[0], dropout_rate, is_training)
    layers[1] = dropout(layers[1], dropout_rate, is_training)
    layers_sequence_lengths[0] = input1_lengths
    layers_sequence_lengths[1] = input2_lengths
    for _, topo_i in enumerate(topology):
        if topo_i == '|':
            continue
        if graph.layers[topo_i].graph_type == LayerType.input.value:
            pass  # ......
        elif graph.layers[topo_i].graph_type == LayerType.attention.value:
            pass  # ......
        # More layers to handle
```

As we can see, this function is actually a compiler that converts the internal model DAG configuration `graph` (introduced in the `Model configuration format` section below) into a TensorFlow computation graph.

```
topology = graph.is_topology()
```

performs topological sorting on the internal graph representation, and the code inside the loop:

```
for _, topo_i in enumerate(topology):
```

performs the actual conversion that maps each layer to a part of the TensorFlow computation graph.
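
To make this compile-in-topological-order idea concrete, here is a small, self-contained sketch of the same pattern (this is not the repository's implementation; `build_op` and the dict-based layer records are illustrative stand-ins):

```
from collections import deque

def topological_order(layers):
    # Kahn's algorithm over the layer DAG; layers reference each other by index.
    indegree = {i: len(layer['input']) for i, layer in enumerate(layers)}
    ready = deque(i for i, deg in indegree.items() if deg == 0)
    order = []
    while ready:
        i = ready.popleft()
        order.append(i)
        for j in layers[i]['output']:
            indegree[j] -= 1
            if indegree[j] == 0:
                ready.append(j)
    return order

def compile_graph(layers, build_op):
    # Visit layers in topological order, so every layer's inputs already exist.
    outputs = {}
    for i in topological_order(layers):
        inputs = [outputs[j] for j in layers[i]['input']]
        outputs[i] = build_op(layers[i], inputs)
    return outputs
```

Visiting layers in topological order guarantees that when `build_op` runs for a layer, the tensors produced by all of its input layers have already been created.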

### The tuner

The tuner is much simpler than the trial. They actually share the same `graph.py`. In addition, the tuner has a `customer_tuner.py`, whose most important class is `CustomerTuner`:

```
class CustomerTuner(Tuner):
    # ......
    def generate_parameters(self, parameter_id):
        """Returns a set of trial graph config, as a serializable object.
        parameter_id : int
        """
        if len(self.population) <= 0:
            logger.debug("the len of population lower than zero.")
            raise Exception('The population is empty')
        pos = -1
        for i in range(len(self.population)):
            if self.population[i].result == None:
                pos = i
                break
        if pos != -1:
            indiv = copy.deepcopy(self.population[pos])
            self.population.pop(pos)
            temp = json.loads(graph_dumps(indiv.config))
        else:
            random.shuffle(self.population)
            if self.population[0].result > self.population[1].result:
                self.population[0] = self.population[1]
            indiv = copy.deepcopy(self.population[0])
            self.population.pop(1)
            indiv.mutation()
            graph = indiv.config
            temp = json.loads(graph_dumps(graph))
        # ......
```

As we can see, the overridden method `generate_parameters` implements a fairly naive mutation algorithm. The code lines:

```
if self.population[0].result > self.population[1].result:
    self.population[0] = self.population[1]
indiv = copy.deepcopy(self.population[0])
```

control the mutation process: they always take two random individuals from the population, keeping and mutating only the one with the better result.
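
The same selection step can be written as a standalone size-two tournament. Here is a minimal sketch (assuming a larger `result` is better; the objects with `result` and `mutation` mirror the attributes used above, but this is illustrative, not the repository's code):

```
import copy
import random

def next_candidate(population):
    # Pick two random individuals, drop the worse one, and return a mutated
    # copy of the better one as the next trial's configuration.
    random.shuffle(population)
    first, second = population[0], population[1]
    winner, loser = (first, second) if first.result >= second.result else (second, first)
    population.remove(loser)
    child = copy.deepcopy(winner)
    child.mutation()
    return child
```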

## Model configuration format

Here is an example of the model configuration, which is passed from the tuner to the trial in the architecture search procedure.

```
{
    "max_layer_num": 50,
    "layers": [
        {
            "input_size": 0,
            "type": 3,
            "output_size": 1,
            "input": [],
            "size": "x",
            "output": [4, 5],
            "is_delete": false
        },
        {
            "input_size": 0,
            "type": 3,
            "output_size": 1,
            "input": [],
            "size": "y",
            "output": [4, 5],
            "is_delete": false
        },
        {
            "input_size": 1,
            "type": 4,
            "output_size": 0,
            "input": [6],
            "size": "x",
            "output": [],
            "is_delete": false
        },
        {
            "input_size": 1,
            "type": 4,
            "output_size": 0,
            "input": [5],
            "size": "y",
            "output": [],
            "is_delete": false
        },
        {"Comment": "More layers will be here for actual graphs."}
    ]
}
```

Every model configuration has a "layers" section, which is a JSON list of layer definitions. Each layer definition is itself a JSON object, where:

* `type` is the type of the layer. 0, 1, 2, 3, and 4 correspond to attention, self-attention, RNN, input, and output layers, respectively.
* `size` is the length of the output. "x" and "y" correspond to document length and question length, respectively.
* `input_size` is the number of inputs the layer has.
* `input` lists the indices of the layers this layer takes as input.
* `output` lists the indices of the layers that use this layer's output as their input.
* `is_delete` marks whether the layer has been deleted and is no longer available.
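
As a quick illustration of how such a configuration can be consumed, here is a small hypothetical sketch (not the repository's loader; the type codes and field names simply follow the example above) that prints a summary of the live layers:

```
import json

LAYER_TYPES = {0: 'attention', 1: 'self-attention', 2: 'rnn', 3: 'input', 4: 'output'}

def describe_layers(config_text):
    config = json.loads(config_text)
    for i, layer in enumerate(config['layers']):
        if 'type' not in layer or layer.get('is_delete'):
            continue  # skip deleted layers and non-layer entries
        kind = LAYER_TYPES.get(layer['type'], 'unknown')
        print('layer %d: %s, size=%s, inputs=%s, outputs=%s'
              % (i, kind, layer['size'], layer['input'], layer['output']))
```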
4 changes: 3 additions & 1 deletion install.sh
@@ -1,3 +1,5 @@
#!/bin/bash
make easy-install
make build
make install-dependencies
make dev-install
source ~/.bashrc
8 changes: 1 addition & 7 deletions setup.py
Original file line number Diff line number Diff line change
@@ -81,16 +81,10 @@ def run(self):
        'pyyaml',
        'requests',
        'scipy',
        'schema'
    ],
    dependency_links = [
        'git+https://github.com/hyperopt/hyperopt.git'
        'schema'
    ],

    cmdclass={
        'install': CustomInstallCommand
    },
    entry_points={
        'console_scripts': ['nnictl = nnicmd.nnictl:parse_args']
    }
)
9 changes: 3 additions & 6 deletions tools/nnicmd/config_schema.py
@@ -25,8 +25,8 @@
    'authorName': str,
    'experimentName': str,
    'trialConcurrency': And(int, lambda n: 1 <= n <= 999999),
    'maxExecDuration': Regex(r'^[1-9][0-9]*[s|m|h|d]$'),
    'maxTrialNum': And(int, lambda x: 1 <= x <= 99999),
    Optional('maxExecDuration'): Regex(r'^[1-9][0-9]*[s|m|h|d]$'),
    Optional('maxTrialNum'): And(int, lambda x: 1 <= x <= 99999),
    'trainingServicePlatform': And(str, lambda x: x in ['remote', 'local', 'pai']),
    Optional('searchSpacePath'): os.path.exists,
    'useAnnotation': bool,
@@ -41,10 +41,7 @@
        'codeDir': os.path.exists,
        'classFileName': str,
        'className': str,
        Optional('classArgs'): {
            Optional('optimize_mode'): Or('maximize', 'minimize'),
            Optional('speed'): int
        },
        Optional('classArgs'): dict,
        Optional('gpuNum'): And(int, lambda x: 0 <= x <= 99999),
    }),
    'trial':{
5 changes: 5 additions & 0 deletions tools/nnicmd/launcher_utils.py
@@ -82,6 +82,11 @@ def validate_common_content(experiment_config):
    '''Validate whether the common values in experiment_config is valid'''
    try:
        CONFIG_SCHEMA.validate(experiment_config)
        #set default value
        if experiment_config.get('maxExecDuration') is None:
            experiment_config['maxExecDuration'] = '999d'
        if experiment_config.get('maxTrialNum') is None:
            experiment_config['maxTrialNum'] = 99999
    except Exception as exception:
        raise Exception(exception)

4 changes: 4 additions & 0 deletions tools/nnicmd/nnictl.py
@@ -118,6 +118,10 @@ def parse_args():
    parser_log_stderr.add_argument('--head', '-H', dest='head', type=int, help='get head -100 content of stderr')
    parser_log_stderr.add_argument('--path', '-p', action='store_true', default=False, help='get the path of stderr file')
    parser_log_stderr.set_defaults(func=log_stderr)
    parser_log_trial = parser_log_subparsers.add_parser('trial', help='get trial log path')
    parser_log_trial.add_argument('--id', '-I', dest='id', help='find trial log path by id')
    parser_log_trial.set_defaults(func=log_trial)


    args = parser.parse_args()
    args.func(args)
30 changes: 30 additions & 0 deletions tools/nnicmd/nnictl_utils.py
@@ -175,6 +175,36 @@ def log_stderr(args):
    '''get stderr log'''
    log_internal(args, 'stderr')

def log_trial(args):
    '''get trial log path'''
    trial_id_path_dict = {}
    nni_config = Config()
    rest_port = nni_config.get_config('restServerPort')
    rest_pid = nni_config.get_config('restServerPid')
    if not detect_process(rest_pid):
        print_error('Experiment is not running...')
        return
    running, response = check_rest_server_quick(rest_port)
    if running:
        response = rest_get(trial_jobs_url(rest_port), 20)
        if response and check_response(response):
            content = json.loads(response.text)
            for trial in content:
                trial_id_path_dict[trial['id']] = trial['logPath']
    else:
        print_error('Restful server is not running...')
        exit(0)
    if args.id:
        if trial_id_path_dict.get(args.id):
            print('id:' + args.id + ' path:' + trial_id_path_dict[args.id])
        else:
            print_error('trial id is not valid!')
            exit(0)
    else:
        for key in trial_id_path_dict.keys():
            print('id:' + key + ' path:' + trial_id_path_dict[key])


def get_config(args):
    '''get config info'''
    nni_config = Config()
