From 3d1e4e9e2b0c3190e80adf6bd36f921aaa1783ff Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 14 Sep 2018 16:40:27 +0800 Subject: [PATCH 01/26] fix nnictl bug --- tools/nnicmd/nnictl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 73b2950a55..9dd9f8dfa9 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -29,7 +29,7 @@ def nni_help_info(*args): def parse_args(): '''Definite the arguments users need to follow and input''' - parser = argparse.ArgumentParser(prog='nni ctl', description='use nni control') + parser = argparse.ArgumentParser(prog='nnictl', description='use nnictl command to control nni experiments') parser.set_defaults(func=nni_help_info) # create subparsers for args with sub values From 2b01089f26bbde224b5c5aac7b8448d4d84ed975 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Sun, 30 Sep 2018 11:20:29 +0800 Subject: [PATCH 02/26] fix install.sh --- install.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/install.sh b/install.sh index 0c3d39bcf1..3d5199e187 100644 --- a/install.sh +++ b/install.sh @@ -1,7 +1,3 @@ #!/bin/bash -make install-dependencies -make build -make dev-install -make install-examples -make update-bash-config +make easy-install source ~/.bashrc From 346badd0fbe737870200d182847436b65c3d3a7f Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 10 Oct 2018 18:19:19 +0800 Subject: [PATCH 03/26] add desc for Dockerfile.build.base --- deployment/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deployment/README.md b/deployment/README.md index b19ff06260..7da20075ec 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,6 +2,8 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. +Dockerfile.build.base is the base Docker, including Ubuntu, cudnn and the NNI environment. +Dockerfile is the customized docker for users, if you want to add your own deeplearning environment, you could update this Dockerfile. ## 2.Including Libraries ``` From 46a8350883153a1ec98d438ff36c5182f872f7e0 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 11 Oct 2018 17:19:27 +0800 Subject: [PATCH 04/26] update document for Dockerfile --- deployment/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment/README.md b/deployment/README.md index 7da20075ec..19b84cba3f 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,8 +2,8 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. -Dockerfile.build.base is the base Docker, including Ubuntu, cudnn and the NNI environment. -Dockerfile is the customized docker for users, if you want to add your own deeplearning environment, you could update this Dockerfile. +Dockerfile.build.base could build the base Docker image, users can get a docker image with Ubuntu and NNI environment after building this file. +Dockerfile could build the customized docker image, users could build their customized docker image using this file. ## 2.Including Libraries ``` From a8708174b7ec69bd1297e19a2f9c4a4cf6372478 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 16 Oct 2018 15:36:47 +0800 Subject: [PATCH 05/26] update --- tools/nnicmd/common_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/common_utils.py b/tools/nnicmd/common_utils.py index ab5f5b11c9..164e5743d6 100644 --- a/tools/nnicmd/common_utils.py +++ b/tools/nnicmd/common_utils.py @@ -67,7 +67,7 @@ def detect_port(port): socket_test = socket.socket(socket.AF_INET,socket.SOCK_STREAM) try: socket_test.connect(('127.0.0.1', int(port))) - socket_test.shutdown(2) + socket_test.close() return True except: return False From b45268cff7ffd58c7c4ca25550f5e44cfb13ab45 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 16 Oct 2018 15:50:04 +0800 Subject: [PATCH 06/26] refactor port detect --- tools/nnicmd/common_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/nnicmd/common_utils.py b/tools/nnicmd/common_utils.py index 164e5743d6..0ffd8eb8f9 100644 --- a/tools/nnicmd/common_utils.py +++ b/tools/nnicmd/common_utils.py @@ -70,4 +70,8 @@ def detect_port(port): socket_test.close() return True except: + try: + socket_test.close() + except: + return False return False From 59626ecee84804aff19dc97b5a56f2b0b2230301 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 16 Oct 2018 16:03:36 +0800 Subject: [PATCH 07/26] update --- tools/nnicmd/common_utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/nnicmd/common_utils.py b/tools/nnicmd/common_utils.py index 0ffd8eb8f9..164e5743d6 100644 --- a/tools/nnicmd/common_utils.py +++ b/tools/nnicmd/common_utils.py @@ -70,8 +70,4 @@ def detect_port(port): socket_test.close() return True except: - try: - socket_test.close() - except: - return False return False From 2ca84c5e5d378905d1bedabd8864676b0d42150d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 17 Oct 2018 12:29:40 +0800 Subject: [PATCH 08/26] refactor NNICTLDOC.md --- docs/NNICTLDOC.md | 78 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 5e269b950e..8139f5b8c4 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -14,6 +14,7 @@ nnictl trial nnictl experiment nnictl config nnictl log +nnictl webui ``` ### Manage an experiment * __nnictl create__ @@ -33,7 +34,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --config, -c| True| |yaml configure file of the experiment| - + | --port, -p | False| |the port of restful server| * __nnictl resume__ @@ -56,11 +57,20 @@ nnictl log * __nnictl stop__ * Description - You can use this command to stop a running experiment. + You can use this command to stop a running experiment or multiple experiments. * Usage - nnictl stop + nnictl stop [id] + + * Detail + + 1.If there is an id specified, and the id matches the running experiment, nnictl will stop the corresponding experiment, or will print error message. + 2.If there is no id specified, and there is an experiment running, stop the running experiment, or print error message. + 3.If the id ends with *, nnictl will stop all experiments whose ids matchs the regular. + 4.If the id does not exist but match the prefix of an experiment id, nnictl will stop the matched experiment. + 5.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information. + 6.Users could use 'nnictl stop all' to stop all experiments * __nnictl update__ @@ -78,6 +88,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --filename, -f| True| |the file storing your new search space| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl update concurrency__ * Description @@ -93,6 +104,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --value, -v| True| |the number of allowed concurrent trials| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl update duration__ * Description @@ -108,6 +120,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl trial__ @@ -120,6 +133,12 @@ nnictl log nnictl trial ls + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| + * __nnictl trial kill__ * Description @@ -132,7 +151,8 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --trialid, -t| True| |ID of the trial you want to kill.| + | --trialid, -t| True| |ID of the trial you want to kill.| + | --id, -i| False| |ID of the experiment you want to set| @@ -146,6 +166,36 @@ nnictl log * Usage nnictl experiment show + + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| + + +* __nnictl experiment status__ + * Description + + Show the status of experiment. + * Usage + + nnictl experiment status + + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| + + +* __nnictl experiment list__ + * Description + + Show the id and start time of all running experiments. + * Usage + + nnictl experiment list @@ -176,6 +226,7 @@ nnictl log | --head, -h| False| |show head lines of stdout| | --tail, -t| False| |show tail lines of stdout| | --path, -p| False| |show the path of stdout file| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl log stderr__ * Description @@ -193,6 +244,7 @@ nnictl log | --head, -h| False| |show head lines of stderr| | --tail, -t| False| |show tail lines of stderr| | --path, -p| False| |show the path of stderr file| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl log trial__ * Description @@ -208,4 +260,20 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --id, -I| False| |the id of trial| - \ No newline at end of file + + +### Manage webui +* __nnictl webui url__ + * Description + + Show the urls of the experiment. + + * Usage + + nnictl webui url + + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| \ No newline at end of file From ab02c93c864ff5545d50aff15cacf676948ab8ce Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 17 Oct 2018 16:11:56 +0800 Subject: [PATCH 09/26] add document for pai and nnictl --- docs/ExperimentConfig.md | 16 +++++++++------- docs/GetStarted.md | 2 +- docs/RemoteMachineMode.md | 15 +++++++++------ 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/docs/ExperimentConfig.md b/docs/ExperimentConfig.md index 8f31129654..74fc121f3f 100644 --- a/docs/ExperimentConfig.md +++ b/docs/ExperimentConfig.md @@ -12,7 +12,7 @@ experimentName: trialConcurrency: maxExecDuration: maxTrialNum: -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: searchSpacePath: #choice: true, false @@ -42,7 +42,7 @@ experimentName: trialConcurrency: maxExecDuration: maxTrialNum: -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: searchSpacePath: #choice: true, false @@ -79,7 +79,7 @@ experimentName: trialConcurrency: maxExecDuration: maxTrialNum: -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: #choice: true, false useAnnotation: @@ -145,6 +145,8 @@ machineList: * __local__ mode means you run an experiment in your local linux machine. * __remote__ mode means you submit trial jobs to remote linux machines. If you set platform as remote, you should complete __machineList__ field. + + * __pai__ mode means you submit trial jobs to [OpenPai](https://github.com/Microsoft/pai) of Microsoft. For more details of pai configuration, please reference [PAIMOdeDoc](./PAIMode.md) * __searchSpacePath__ * Description @@ -268,7 +270,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local #choice: true, false useAnnotation: true @@ -292,7 +294,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: /nni/search_space.json #choice: true, false @@ -324,7 +326,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: /nni/search_space.json #choice: true, false @@ -360,7 +362,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: remote searchSpacePath: /nni/search_space.json #choice: true, false diff --git a/docs/GetStarted.md b/docs/GetStarted.md index 42366eb1e8..a98efcda7f 100644 --- a/docs/GetStarted.md +++ b/docs/GetStarted.md @@ -62,7 +62,7 @@ maxExecDuration: 3h # empty means never stop maxTrialNum: 100 -# choice: local, remote +# choice: local, remote, pai trainingServicePlatform: local # choice: true, false diff --git a/docs/RemoteMachineMode.md b/docs/RemoteMachineMode.md index 94f393324d..e7bf77888b 100644 --- a/docs/RemoteMachineMode.md +++ b/docs/RemoteMachineMode.md @@ -2,11 +2,11 @@ === NNI supports running an experiment on multiple machines, called remote machine mode. Let's say you have multiple machines with the account `bob` (Note: the account is not necessarily the same on multiple machines): -| IP | Username| Password | -| -------- |---------|-------| -| 10.1.1.1 | bob | bob123 | -| 10.1.1.2 | bob | bob123 | -| 10.1.1.3 | bob | bob123 | +| IP | Username| Password | Port | +| -------- |---------|-------|-------| +| 10.1.1.1 | bob | bob123 |22| +| 10.1.1.2 | bob | bob123 |22| +| 10.1.1.3 | bob | bob123 |22| ## Setup environment Install NNI on each of your machines following the install guide [here](GetStarted.md). @@ -34,7 +34,7 @@ trialConcurrency: 2 maxExecDuration: 3h # empty means never stop maxTrialNum: 100 -# choice: local, remote +# choice: local, remote, pai trainingServicePlatform: local # choice: true, false useAnnotation: true @@ -51,12 +51,15 @@ machineList: - ip: 10.1.1.1 username: bob passwd: bob123 + port: 22 - ip: 10.1.1.2 username: bob passwd: bob123 + port: 22 - ip: 10.1.1.3 username: bob passwd: bob123 + port: 22 ``` Simply filling the `machineList` section. This yaml file is named `exp_remote.yaml`, then run: ``` From 5ff7b4574e013bf2040e560fbd5a321f78fc946b Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 17 Oct 2018 20:20:30 +0800 Subject: [PATCH 10/26] add default value for port --- docs/RemoteMachineMode.md | 13 +++++-------- tools/nnicmd/config_schema.py | 4 ++-- tools/nnicmd/launcher_utils.py | 5 +++++ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/RemoteMachineMode.md b/docs/RemoteMachineMode.md index e7bf77888b..8c4d90ac3d 100644 --- a/docs/RemoteMachineMode.md +++ b/docs/RemoteMachineMode.md @@ -2,11 +2,11 @@ === NNI supports running an experiment on multiple machines, called remote machine mode. Let's say you have multiple machines with the account `bob` (Note: the account is not necessarily the same on multiple machines): -| IP | Username| Password | Port | -| -------- |---------|-------|-------| -| 10.1.1.1 | bob | bob123 |22| -| 10.1.1.2 | bob | bob123 |22| -| 10.1.1.3 | bob | bob123 |22| +| IP | Username| Password | +| -------- |---------|-------| +| 10.1.1.1 | bob | bob123 | +| 10.1.1.2 | bob | bob123 | +| 10.1.1.3 | bob | bob123 | ## Setup environment Install NNI on each of your machines following the install guide [here](GetStarted.md). @@ -51,15 +51,12 @@ machineList: - ip: 10.1.1.1 username: bob passwd: bob123 - port: 22 - ip: 10.1.1.2 username: bob passwd: bob123 - port: 22 - ip: 10.1.1.3 username: bob passwd: bob123 - port: 22 ``` Simply filling the `machineList` section. This yaml file is named `exp_remote.yaml`, then run: ``` diff --git a/tools/nnicmd/config_schema.py b/tools/nnicmd/config_schema.py index ace9621a5a..129f3392de 100644 --- a/tools/nnicmd/config_schema.py +++ b/tools/nnicmd/config_schema.py @@ -92,12 +92,12 @@ machine_list_schima = { Optional('machineList'):[Or({ 'ip': str, - 'port': And(int, lambda x: 0 < x < 65535), + Optional('port'): And(int, lambda x: 0 < x < 65535), 'username': str, 'passwd': str },{ 'ip': str, - 'port': And(int, lambda x: 0 < x < 65535), + Optional('port'): And(int, lambda x: 0 < x < 65535), 'username': str, 'sshKeyPath': os.path.exists, Optional('passphrase'): str diff --git a/tools/nnicmd/launcher_utils.py b/tools/nnicmd/launcher_utils.py index 30c9cea13e..7c811610e9 100644 --- a/tools/nnicmd/launcher_utils.py +++ b/tools/nnicmd/launcher_utils.py @@ -97,6 +97,11 @@ def validate_common_content(experiment_config): experiment_config['maxExecDuration'] = '999d' if experiment_config.get('maxTrialNum') is None: experiment_config['maxTrialNum'] = 99999 + if experiment_config['trainingServicePlatform'] == 'remote': + for index in range(len(experiment_config['machineList'])): + if experiment_config['machineList'][index].get('port') is None: + experiment_config['machineList'][index]['port'] = 22 + except Exception as exception: raise Exception(exception) From 5ae146d7e6ab597e5c43a690c491a14e452be734 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 18 Oct 2018 10:40:48 +0800 Subject: [PATCH 11/26] add exception handling in trial_keeper.py --- tools/trial_tool/trial_keeper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/trial_tool/trial_keeper.py b/tools/trial_tool/trial_keeper.py index 675a0566ac..ab1b42ac64 100644 --- a/tools/trial_tool/trial_keeper.py +++ b/tools/trial_tool/trial_keeper.py @@ -54,8 +54,8 @@ def main_loop(args): print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) #copy local directory to hdfs nni_local_output_dir = os.environ['NNI_OUTPUT_DIR'] - hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) try: + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) if copyDirectoryToHdfs(nni_local_output_dir, args.pai_hdfs_output_dir, hdfs_client): print('copy directory from {0} to {1} success!'.format(nni_local_output_dir, args.pai_hdfs_output_dir)) else: From 1dde461f882975860dff6c54bf6b264c9929733e Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 18 Oct 2018 16:12:19 +0800 Subject: [PATCH 12/26] fix port bug --- tools/nnicmd/nnictl_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index a400170cb3..0aa31cf635 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -54,7 +54,7 @@ def get_experiment_port(args): if not args.id: return list(experiment_dict.values())[0][0] if experiment_dict.get(args.id): - return experiment_dict[args.id] + return experiment_dict[args.id][0] else: print_error('Id not correct!') return None From 9fdf6d451219f535a7326b4533be20ddc0d4f35f Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 16:51:03 +0800 Subject: [PATCH 13/26] fix resume --- tools/nnicmd/launcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index c9da0a4518..d2df91b660 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -305,7 +305,10 @@ def resume_experiment(args): '''resume an experiment''' nni_config = Config(args.port) experiment_config = nni_config.get_config('experimentConfig') - experiment_id = nni_config.get_config('experimentId') + if args.id: + experiment_id = args.id + else: + experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) def create_experiment(args): From c1285f88b8797b5d4f9f4f128e5953bf84024682 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 17:18:09 +0800 Subject: [PATCH 14/26] fix nnictl resume and fix nnictl stop --- deployment/Dockerfile | 82 ++++++++++++++++++++++++++++++- deployment/Dockerfile.build.base | 83 -------------------------------- deployment/README.md | 6 +-- tools/nnicmd/launcher.py | 11 +++-- tools/nnicmd/nnictl.py | 4 +- tools/nnicmd/nnictl_utils.py | 2 +- 6 files changed, 90 insertions(+), 98 deletions(-) delete mode 100644 deployment/Dockerfile.build.base diff --git a/deployment/Dockerfile b/deployment/Dockerfile index d0ddf99587..8ad2632402 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -1,7 +1,85 @@ -FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 LABEL maintainer='Microsoft NNI Team' +ENV HADOOP_VERSION=2.7.2 +LABEL HADOOP_VERSION=2.7.2 + +RUN DEBIAN_FRONTEND=noninteractive && \ + apt-get -y update && \ + apt-get -y install sudo \ + apt-utils \ + git \ + curl \ + vim \ + unzip \ + wget \ + build-essential \ + cmake \ + libopenblas-dev \ + automake \ + openjdk-8-jdk \ + openssh-client \ + openssh-server \ + lsof \ + python3.5 \ + python3-dev \ + python3-pip \ + python3-tk \ + libcupti-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# numpy 1.14.3 scipy 1.1.0 +RUN pip3 --no-cache-dir install \ + numpy==1.14.3 scipy==1.1.0 + +# +#Install hadoop +# +RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ + tar xz -C /usr/local && \ + mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop + +# +#Install NNI +# +RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_INSTALL=/usr/local/hadoop \ + NVIDIA_VISIBLE_DEVICES=all + +ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ + HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ + HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ + HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ + HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ + HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" + +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ + LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server + # #Tensorflow 1.10.0 # @@ -12,4 +90,4 @@ RUN pip3 --no-cache-dir install tensorflow-gpu==1.10.0 # RUN pip3 --no-cache-dir install Keras==2.1.6 -WORKDIR /root \ No newline at end of file +WORKDIR /root diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base deleted file mode 100644 index 56315a3b5f..0000000000 --- a/deployment/Dockerfile.build.base +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - -LABEL maintainer='Microsoft NNI Team' - -ENV HADOOP_VERSION=2.7.2 -LABEL HADOOP_VERSION=2.7.2 - -RUN DEBIAN_FRONTEND=noninteractive && \ - apt-get -y update && \ - apt-get -y install sudo \ - apt-utils \ - git \ - curl \ - vim \ - unzip \ - wget \ - build-essential \ - cmake \ - libopenblas-dev \ - automake \ - openjdk-8-jdk \ - openssh-client \ - openssh-server \ - lsof \ - python3.5 \ - python3-dev \ - python3-pip \ - python3-tk \ - libcupti-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# numpy 1.14.3 scipy 1.1.0 -RUN pip3 --no-cache-dir install \ - numpy==1.14.3 scipy==1.1.0 - -# -#Install hadoop -# -RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ - tar xz -C /usr/local && \ - mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop - -# -#Install NNI -# -RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 - -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ - HADOOP_INSTALL=/usr/local/hadoop \ - NVIDIA_VISIBLE_DEVICES=all - -ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ - HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ - HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ - HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ - HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ - HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" - -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ - LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server - -WORKDIR /root diff --git a/deployment/README.md b/deployment/README.md index 19b84cba3f..c9bd2e8175 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,8 +2,7 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. -Dockerfile.build.base could build the base Docker image, users can get a docker image with Ubuntu and NNI environment after building this file. -Dockerfile could build the customized docker image, users could build their customized docker image using this file. +Dockerfile could build docker image, users could build their customized docker image using this file. ## 2.Including Libraries ``` @@ -17,6 +16,5 @@ NNI v0.1 ## 3 How to run - docker build -f Dockerfile.build.base -t nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 . docker build -t nni/nni . - nvidia-docker run -it nni/nni \ No newline at end of file + nvidia-docker run -it nni/nni \ No newline at end of file diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index d2df91b660..208e48dc46 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -34,6 +34,7 @@ from .constants import * from .webui_utils import * import time +from .nnictl_utils import get_experiment_port def start_rest_server(port, platform, mode, experiment_id=None): '''Run nni manager process''' @@ -303,12 +304,12 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): def resume_experiment(args): '''resume an experiment''' - nni_config = Config(args.port) + port = get_experiment_port(args) + if port is None: + return None + nni_config = Config(port) experiment_config = nni_config.get_config('experimentConfig') - if args.id: - experiment_id = args.id - else: - experiment_id = nni_config.get_config('experimentId') + experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) def create_experiment(args): diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index a82891247f..d394dc51ca 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,9 +45,7 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--experiment', '-e', dest='id', help='ID of the experiment you want to resume') - parser_resume.add_argument('--manager', '-m', default='nnimanager', dest='manager') - parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') + parser_resume.add_argument('--id', '-i', dest='id', help='ID of the experiment you want to resume') parser_resume.set_defaults(func=resume_experiment) # parse update command diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 0aa31cf635..a852b8f86c 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -137,6 +137,7 @@ def stop_experiment(args): rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): print_normal('Experiment is not running...') + experiment_config.remove_experiment(experiment_id) return running, _ = check_rest_server_quick(rest_port) stop_rest_result = True @@ -153,7 +154,6 @@ def stop_experiment(args): call(cmds) if stop_rest_result: print_normal('Stop experiment success!') - experiment_config = Experiments() experiment_config.remove_experiment(experiment_id) def trial_ls(args): From af0d081401b2fb9ba57b9e0bb5568a01741f9480 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 17:24:30 +0800 Subject: [PATCH 15/26] fix document --- docs/NNICTLDOC.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 8139f5b8c4..d123c69aff 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,7 +49,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --experiment, -e| False| |ID of the experiment you want to resume| + | --id, -i| False| |ID of the experiment you want to resume| From 7ce8fd88ca9fb994187a5815cc37a8031b842be1 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 17:37:29 +0800 Subject: [PATCH 16/26] update --- docs/NNICTLDOC.md | 2 +- tools/nnicmd/launcher.py | 6 +----- tools/nnicmd/nnictl.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index d123c69aff..6a54b72a6e 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,7 +49,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to resume| + | --port, -p| False| |Rest port of the experiment you want to resume| diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 208e48dc46..c9da0a4518 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -34,7 +34,6 @@ from .constants import * from .webui_utils import * import time -from .nnictl_utils import get_experiment_port def start_rest_server(port, platform, mode, experiment_id=None): '''Run nni manager process''' @@ -304,10 +303,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): def resume_experiment(args): '''resume an experiment''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(args.port) experiment_config = nni_config.get_config('experimentConfig') experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index d394dc51ca..27a301b8d9 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,7 +45,7 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--id', '-i', dest='id', help='ID of the experiment you want to resume') + parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='Rest port of the experiment you want to resume') parser_resume.set_defaults(func=resume_experiment) # parse update command From b29aaed48c863ffe4db7ea316d933f3c173ad31d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 21:02:25 +0800 Subject: [PATCH 17/26] refactor nnictl --- docs/NNICTLDOC.md | 9 +- tools/nnicmd/config_utils.py | 17 +++- tools/nnicmd/constants.py | 4 +- tools/nnicmd/launcher.py | 42 +++++---- tools/nnicmd/nnictl.py | 6 +- tools/nnicmd/nnictl_utils.py | 177 +++++++++++++++++++---------------- tools/nnicmd/updater.py | 5 +- 7 files changed, 152 insertions(+), 108 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 6a54b72a6e..ee47e203c9 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -50,6 +50,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --port, -p| False| |Rest port of the experiment you want to resume| + | --id, -i| True| |The id of the experiment you want to resume| @@ -192,11 +193,17 @@ nnictl webui * __nnictl experiment list__ * Description - Show the id and start time of all running experiments. + Show the information of all running experiments. * Usage nnictl experiment list + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --all| False| False|Show all of experiments, including stopped experiments.| + * __nnictl config show__ diff --git a/tools/nnicmd/config_utils.py b/tools/nnicmd/config_utils.py index 9e1fb7ae91..b84e29ebbf 100644 --- a/tools/nnicmd/config_utils.py +++ b/tools/nnicmd/config_utils.py @@ -73,11 +73,24 @@ def __init__(self): self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment') self.experiments = self.read_file() - def add_experiment(self, id, port, time): + def add_experiment(self, id, port, time, file_name): '''set {key:value} paris to self.experiment''' - self.experiments[id] = [port, time] + self.experiments[id] = {} + self.experiments[id]['port'] = port + self.experiments[id]['startTime'] = time + self.experiments[id]['endTime'] = 'N/A' + self.experiments[id]['status'] = 'running' + self.experiments[id]['fileName'] = file_name self.write_file() + def update_experiment(self, id, key, value): + '''Update experiment''' + if id not in self.experiments: + return False + self.experiments[id][key] = value + self.write_file() + return True + def remove_experiment(self, id): '''remove an experiment by id''' if id in self.experiments: diff --git a/tools/nnicmd/constants.py b/tools/nnicmd/constants.py index 71c3d2112c..fec3b47b24 100644 --- a/tools/nnicmd/constants.py +++ b/tools/nnicmd/constants.py @@ -54,11 +54,13 @@ EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \ 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' -EXPERIMENT_ID_INFO = '-----------------------------------------------------------------------\n' \ +EXPERIMENT_INFORMATION_FORMAT = '-----------------------------------------------------------------------\n' \ ' Experiment information\n' \ '%s\n' \ '-----------------------------------------------------------------------\n' +EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s StartTime: %s EndTime: %s \n' + PACKAGE_REQUIREMENTS = { 'SMAC': 'smac_tuner' } diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index c9da0a4518..645caaa60f 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -34,17 +34,15 @@ from .constants import * from .webui_utils import * import time +import random +import string + +CONFIG_FILE_NAME = ''.join(random.sample(string.ascii_letters + string.digits, 8)) def start_rest_server(port, platform, mode, experiment_id=None): '''Run nni manager process''' + global CONFIG_FILE_NAME print_normal('Checking environment...') - nni_config = Config(port) - rest_port = nni_config.get_config('restServerPort') - running, _ = check_rest_server_quick(rest_port) - if rest_port and running: - print_error(EXPERIMENT_START_FAILED_INFO % port) - exit(1) - if detect_port(port): print_error('Port %s is used by another process, please reset the port!' % port) exit(1) @@ -54,8 +52,8 @@ def start_rest_server(port, platform, mode, experiment_id=None): cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode] if mode == 'resume': cmds += ['--experiment_id', experiment_id] - stdout_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stdout') - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stdout_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stdout') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') stdout_file = open(stdout_full_path, 'a+') stderr_file = open(stderr_full_path, 'a+') time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) @@ -89,7 +87,7 @@ def set_trial_config(experiment_config, port): return True else: print('Error message is {}'.format(response.text)) - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) return False @@ -108,7 +106,7 @@ def set_remote_config(experiment_config, port): if not response or not check_response(response): if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message @@ -125,7 +123,7 @@ def set_pai_config(experiment_config, port): if not response or not response.status_code == 200: if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message @@ -191,7 +189,7 @@ def set_experiment(experiment_config, mode, port): if check_response(response): return response else: - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) print_error('Setting experiment error, error message is {}'.format(response.text)) @@ -199,7 +197,8 @@ def set_experiment(experiment_config, mode, port): def launch_experiment(args, experiment_config, mode, experiment_id=None): '''follow steps to start rest server and start experiment''' - nni_config = Config(args.port) + global CONFIG_FILE_NAME + nni_config = Config(CONFIG_FILE_NAME) # start rest server rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, experiment_id) nni_config.set_config('restServerPid', rest_process.pid) @@ -297,20 +296,29 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): #save experiment information experiment_config = Experiments() - experiment_config.add_experiment(experiment_id, args.port, start_time) + experiment_config.add_experiment(experiment_id, args.port, start_time, CONFIG_FILE_NAME) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) def resume_experiment(args): '''resume an experiment''' - nni_config = Config(args.port) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + if experiment_dict.get(args.id) is None: + print_error('Id not exist!') + exit(1) + if experiment_dict[args.id]['status'] == 'running': + print_error('Experiment %s is running!' % args.id) + exit(1) + nni_config = Config(experiment_dict[args.id]['fileName']) experiment_config = nni_config.get_config('experimentConfig') experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) def create_experiment(args): '''start a new experiment''' - nni_config = Config(args.port) + global CONFIG_FILE_NAME + nni_config = Config(CONFIG_FILE_NAME) config_path = os.path.abspath(args.config) if not os.path.exists(config_path): print_error('Please set correct config path!') diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 27a301b8d9..3328903866 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,7 +45,8 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='Rest port of the experiment you want to resume') + parser_resume.add_argument('--id', '-i', dest='id', required=True, help='The id of the experiment you want to resume') + parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_resume.set_defaults(func=resume_experiment) # parse update command @@ -93,7 +94,8 @@ def parse_args(): parser_experiment_status.add_argument('--id', '-i', dest='id', help='the id of experiment') parser_experiment_status.set_defaults(func=experiment_status) parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') - parser_experiment_list.set_defaults(func=experiment_id) + parser_experiment_list.add_argument('--all', action='store_true', default=False, help='list all of experiments') + parser_experiment_list.set_defaults(func=experiment_list) #TODO:finish webui function #parse board command diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index a852b8f86c..4931c1b954 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -22,42 +22,64 @@ import psutil import json import datetime +import time from subprocess import call, check_output from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .config_utils import Config, Experiments from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url -from .constants import NNICTL_HOME_DIR, EXPERIMENT_ID_INFO +from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT import time -from .common_utils import print_normal, print_error, detect_process +from .common_utils import print_normal, print_error, print_warning, detect_process -def get_experiment_port(args): - '''get the port of an experiment''' +def check_experiment_id(args): + '''check if the id is valid + 1.If there is an id specified, return the corresponding port + 2.If there is no id specified, and there is an experiment running, return it as default port, or return Error + 3.If the id matches an experiment, nnictl will return the id. + 4.If the id ends with *, nnictl will match all ids matchs the regular + 5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id + 6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information + 7.Users could use 'nnictl stop all' to stop all experiments + ''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - #1.If there is an id specified, return the corresponding port - #2.If there is no id specified, and there is an experiment running, return it as default port, or return Error - #3.If the id matches an experiment, nnictl will return the id. - #4.If the id ends with *, nnictl will match all ids matchs the regular - #5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id - #6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information - #7.Users could use 'nnictl stop all' to stop all experiments if not experiment_dict: - print_normal('Experiment is not running...') - return None - if not args.id and len(experiment_dict.keys()) > 1: - print_error('There are multiple experiments running, please set the experiment id...') - experiment_information = "" - for key in experiment_dict.keys(): - experiment_information += ('Id: ' + key + ' StartTime: ' + experiment_dict[key][1] + '\n') - print(EXPERIMENT_ID_INFO % experiment_information) - return None + print_normal('There is no experiment running...') + exit(1) if not args.id: - return list(experiment_dict.values())[0][0] + running_experiment_list = [] + for key in experiment_dict.keys(): + if experiment_dict[key]['status'] == 'running': + running_experiment_list.append(key) + if len(running_experiment_list) > 1: + print_error('There are multiple experiments running, please set the experiment id...') + experiment_information = "" + for key in running_experiment_list: + experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ + experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) + exit(1) + else: + return None if experiment_dict.get(args.id): - return experiment_dict[args.id][0] - else: - print_error('Id not correct!') return None + else: + print_error('Id not correct!') + exit(1) + +def get_config_filename(args): + '''get the file name of config file''' + check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[args.id]['fileName'] + +def get_experiment_port(args): + '''get the port of experiment''' + check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[args.id]['port'] def convert_time_stamp_to_date(content): '''Convert time stamp to date time format''' @@ -73,10 +95,7 @@ def convert_time_stamp_to_date(content): def check_rest(args): '''check if restful server is running''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') running, _ = check_rest_server_quick(rest_port) if not running: @@ -91,27 +110,32 @@ def parse_ids(args): if not experiment_dict: print_normal('Experiment is not running...') return None - experiment_id_list = list(experiment_dict.keys()) result_list = [] + running_experiment_list = [] + for key in experiment_dict.keys(): + if experiment_dict[key]['status'] == 'running': + running_experiment_list.append(key) if not args.id: - if len(experiment_id_list) > 1: + if len(running_experiment_list) > 1: print_error('There are multiple experiments running, please set the experiment id...') experiment_information = "" - for key in experiment_dict.keys(): - experiment_information += ('Id: ' + key + ' StartTime: ' + experiment_dict[key][1] + '\n') - print(EXPERIMENT_ID_INFO % experiment_information) - return None - result_list = experiment_id_list + for key in running_experiment_list: + experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ + experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) + exit(1) + else: + result_list = running_experiment_list elif args.id == 'all': - result_list = experiment_id_list + result_list = running_experiment_list elif args.id.endswith('*'): - for id in experiment_id_list: + for id in running_experiment_list: if id.startswith(args.id[:-1]): result_list.append(id) - elif args.id in experiment_id_list: + elif args.id in running_experiment_list: result_list.append(args.id) else: - for id in experiment_id_list: + for id in running_experiment_list: if id.startswith(args.id): result_list.append(id) if len(result_list) > 1: @@ -128,16 +152,13 @@ def stop_experiment(args): experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() for experiment_id in experiment_id_list: - port = experiment_dict.get(experiment_id)[0] - if port is None: - return None print_normal('Stoping experiment %s' % experiment_id) - nni_config = Config(port) + nni_config = Config(experiment_dict[experiment_id]['fileName']) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): print_normal('Experiment is not running...') - experiment_config.remove_experiment(experiment_id) + experiment_config.update_experiment(experiment_id, 'status', 'stopped') return running, _ = check_rest_server_quick(rest_port) stop_rest_result = True @@ -154,14 +175,13 @@ def stop_experiment(args): call(cmds) if stop_rest_result: print_normal('Stop experiment success!') - experiment_config.remove_experiment(experiment_id) + experiment_config.update_experiment(experiment_id, 'status', 'stopped') + time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + experiment_config.update_experiment(experiment_id, 'endTime', str(time_now)) def trial_ls(args): '''List trial''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -182,10 +202,7 @@ def trial_ls(args): def trial_kill(args): '''List trial''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -203,10 +220,7 @@ def trial_kill(args): def list_experiment(args): '''Get experiment information''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -225,10 +239,7 @@ def list_experiment(args): def experiment_status(args): '''Show the status of experiment''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') result, response = check_rest_server_quick(rest_port) if not result: @@ -246,13 +257,11 @@ def get_log_content(file_name, cmds): def log_internal(args, filetype): '''internal function to call get_log_content''' - port = get_experiment_port(args) - if port is None: - return None + file_name = get_config_filename(args) if filetype == 'stdout': - file_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stdout') + file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stdout') else: - file_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stderr') if args.head: get_log_content(file_full_path, ['head', '-' + str(args.head), file_full_path]) elif args.tail: @@ -273,10 +282,7 @@ def log_stderr(args): def log_trial(args): ''''get trial log path''' trial_id_path_dict = {} - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -304,28 +310,33 @@ def log_trial(args): def get_config(args): '''get config info''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) print(nni_config.get_all_config()) def webui_url(args): '''show the url of web ui''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl')))) -def experiment_id(args): - '''get the id of all experiments''' +def experiment_list(args): + '''get the information of all experiments''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() if not experiment_dict: print('There is no experiment running...') + exit(1) + experiment_id_list = [] + if args.all: + for key in experiment_dict.keys(): + experiment_id_list.append(key) else: - experiment_information = "" for key in experiment_dict.keys(): - experiment_information += ('Id: ' + key + ' StartTime: ' + experiment_dict[key][1] + '\n') - print(EXPERIMENT_ID_INFO % experiment_information) \ No newline at end of file + if experiment_dict[key]['status'] == 'running': + experiment_id_list.append(key) + if not experiment_id_list: + print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments!') + experiment_information = "" + for key in experiment_id_list: + experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ + experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) diff --git a/tools/nnicmd/updater.py b/tools/nnicmd/updater.py index 00291fc61f..d6e9bd15ad 100644 --- a/tools/nnicmd/updater.py +++ b/tools/nnicmd/updater.py @@ -25,6 +25,7 @@ from .url_utils import experiment_url from .config_utils import Config from .common_utils import get_json_content +from .nnictl_utils import check_experiment_id, get_experiment_port, get_config_filename def validate_digit(value, start, end): '''validate if a digit is valid''' @@ -56,7 +57,7 @@ def get_query_type(key): def update_experiment_profile(args, key, value): '''call restful server to update experiment profile''' - nni_config = Config(args.port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') running, _ = check_rest_server_quick(rest_port) if running: @@ -95,7 +96,7 @@ def update_duration(args): def update_trialnum(args): validate_digit(args.value, 1, 999999999) - if update_experiment_profile('maxTrialNum', int(args.value)): + if update_experiment_profile(args, 'maxTrialNum', int(args.value)): print('INFO: update %s success!' % 'trialnum') else: print('ERROR: update %s failed!' % 'trialnum') \ No newline at end of file From 683833bd3053a1e6790644c27a2b7ce6f3b7f41b Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 21:19:58 +0800 Subject: [PATCH 18/26] update --- tools/nnicmd/nnictl_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 4931c1b954..21ab6128b2 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -60,26 +60,26 @@ def check_experiment_id(args): print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) else: - return None + return running_experiment_list[0] if experiment_dict.get(args.id): - return None + return args.id else: print_error('Id not correct!') exit(1) def get_config_filename(args): '''get the file name of config file''' - check_experiment_id(args) + experiment_id = check_experiment_id(args) experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[args.id]['fileName'] + return experiment_dict[experiment_id]['fileName'] def get_experiment_port(args): '''get the port of experiment''' - check_experiment_id(args) + experiment_id = check_experiment_id(args) experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[args.id]['port'] + return experiment_dict[experiment_id]['port'] def convert_time_stamp_to_date(content): '''Convert time stamp to date time format''' From 6149bf96fdaa446eb18235b74520642c911fc763 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 22 Oct 2018 10:49:56 +0800 Subject: [PATCH 19/26] update doc --- docs/NNICTLDOC.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index ee47e203c9..c10086d6eb 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,8 +49,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | --id, -i| True| |The id of the experiment you want to resume| | --port, -p| False| |Rest port of the experiment you want to resume| - | --id, -i| True| |The id of the experiment you want to resume| @@ -193,7 +193,7 @@ nnictl webui * __nnictl experiment list__ * Description - Show the information of all running experiments. + Show the information of all the (running) experiments. * Usage nnictl experiment list From 73bef2f7452b5aa1727e1301d65af2169aed5d9d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 22 Oct 2018 16:09:24 +0800 Subject: [PATCH 20/26] update --- tools/nnicmd/nnictl.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 3328903866..4c17ebcaf0 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,7 +45,7 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--id', '-i', dest='id', required=True, help='The id of the experiment you want to resume') + parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume') parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_resume.set_defaults(func=resume_experiment) @@ -54,15 +54,15 @@ def parse_args(): #add subparsers for parser_updater parser_updater_subparsers = parser_updater.add_subparsers() parser_updater_searchspace = parser_updater_subparsers.add_parser('searchspace', help='update searchspace') - parser_updater_searchspace.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_updater_searchspace.add_argument('id', nargs='?', help='the id of experiment') parser_updater_searchspace.add_argument('--filename', '-f', required=True) parser_updater_searchspace.set_defaults(func=update_searchspace) parser_updater_concurrency = parser_updater_subparsers.add_parser('concurrency', help='update concurrency') - parser_updater_concurrency.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_updater_concurrency.add_argument('id', nargs='?', help='the id of experiment') parser_updater_concurrency.add_argument('--value', '-v', required=True) parser_updater_concurrency.set_defaults(func=update_concurrency) parser_updater_duration = parser_updater_subparsers.add_parser('duration', help='update duration') - parser_updater_duration.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_updater_duration.add_argument('id', nargs='?', help='the id of experiment') parser_updater_duration.add_argument('--value', '-v', required=True) parser_updater_duration.set_defaults(func=update_duration) @@ -76,10 +76,10 @@ def parse_args(): #add subparsers for parser_trial parser_trial_subparsers = parser_trial.add_subparsers() parser_trial_ls = parser_trial_subparsers.add_parser('ls', help='list trial jobs') - parser_trial_ls.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_trial_ls.add_argument('id', nargs='?', help='the id of experiment') parser_trial_ls.set_defaults(func=trial_ls) parser_trial_kill = parser_trial_subparsers.add_parser('kill', help='kill trial jobs') - parser_trial_kill.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_trial_kill.add_argument('id', nargs='?', help='the id of experiment') parser_trial_kill.add_argument('--trialid', '-t', required=True, dest='trialid', help='the id of trial to be killed') parser_trial_kill.set_defaults(func=trial_kill) @@ -88,10 +88,10 @@ def parse_args(): #add subparsers for parser_experiment parser_experiment_subparsers = parser_experiment.add_subparsers() parser_experiment_show = parser_experiment_subparsers.add_parser('show', help='show the information of experiment') - parser_experiment_show.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_experiment_show.add_argument('id', nargs='?', help='the id of experiment') parser_experiment_show.set_defaults(func=list_experiment) parser_experiment_status = parser_experiment_subparsers.add_parser('status', help='show the status of experiment') - parser_experiment_status.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_experiment_status.add_argument('id', nargs='?', help='the id of experiment') parser_experiment_status.set_defaults(func=experiment_status) parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') parser_experiment_list.add_argument('--all', action='store_true', default=False, help='list all of experiments') @@ -103,14 +103,14 @@ def parse_args(): #add subparsers for parser_board parser_webui_subparsers = parser_webui.add_subparsers() parser_webui_url = parser_webui_subparsers.add_parser('url', help='show the url of web ui') - parser_webui_url.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_webui_url.add_argument('id', nargs='?', help='the id of experiment') parser_webui_url.set_defaults(func=webui_url) #parse config command parser_config = subparsers.add_parser('config', help='get config information') parser_config_subparsers = parser_config.add_subparsers() parser_config_show = parser_config_subparsers.add_parser('show', help='show the information of config') - parser_config_show.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_config_show.add_argument('id', nargs='?', help='the id of experiment') parser_config_show.set_defaults(func=get_config) #parse log command @@ -118,19 +118,19 @@ def parse_args(): # add subparsers for parser_log parser_log_subparsers = parser_log.add_subparsers() parser_log_stdout = parser_log_subparsers.add_parser('stdout', help='get stdout information') - parser_log_stdout.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_log_stdout.add_argument('id', nargs='?', help='the id of experiment') parser_log_stdout.add_argument('--tail', '-T', dest='tail', type=int, help='get tail -100 content of stdout') parser_log_stdout.add_argument('--head', '-H', dest='head', type=int, help='get head -100 content of stdout') parser_log_stdout.add_argument('--path', action='store_true', default=False, help='get the path of stdout file') parser_log_stdout.set_defaults(func=log_stdout) parser_log_stderr = parser_log_subparsers.add_parser('stderr', help='get stderr information') - parser_log_stderr.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_log_stderr.add_argument('id', nargs='?', help='the id of experiment') parser_log_stderr.add_argument('--tail', '-T', dest='tail', type=int, help='get tail -100 content of stderr') parser_log_stderr.add_argument('--head', '-H', dest='head', type=int, help='get head -100 content of stderr') parser_log_stderr.add_argument('--path', action='store_true', default=False, help='get the path of stderr file') parser_log_stderr.set_defaults(func=log_stderr) parser_log_trial = parser_log_subparsers.add_parser('trial', help='get trial log path') - parser_log_trial.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_log_trial.add_argument('id', nargs='?', help='the id of experiment') parser_log_trial.add_argument('--trialid', '-T', dest='trialid', help='find trial log path by id') parser_log_trial.set_defaults(func=log_trial) From 5c397f6e5655afebcc99c504cec795364d714c25 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 15:39:46 +0800 Subject: [PATCH 21/26] update nnictl --- tools/nnicmd/config_utils.py | 4 +- tools/nnicmd/launcher.py | 105 ++++++++++++++++++++++------------- tools/nnicmd/nnictl.py | 4 +- tools/nnicmd/nnictl_utils.py | 95 ++++++++++++++++--------------- tools/nnicmd/webui_utils.py | 4 +- 5 files changed, 120 insertions(+), 92 deletions(-) diff --git a/tools/nnicmd/config_utils.py b/tools/nnicmd/config_utils.py index b84e29ebbf..17adb05fd6 100644 --- a/tools/nnicmd/config_utils.py +++ b/tools/nnicmd/config_utils.py @@ -26,8 +26,8 @@ class Config: '''a util class to load and save config''' - def __init__(self, port): - config_path = os.path.join(NNICTL_HOME_DIR, str(port)) + def __init__(self, file_path): + config_path = os.path.join(NNICTL_HOME_DIR, str(file_path)) os.makedirs(config_path, exist_ok=True) self.config_file = os.path.join(config_path, '.config') self.config = self.read_file() diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 645caaa60f..676a2946c6 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -37,12 +37,9 @@ import random import string -CONFIG_FILE_NAME = ''.join(random.sample(string.ascii_letters + string.digits, 8)) - -def start_rest_server(port, platform, mode, experiment_id=None): +def start_rest_server(port, platform, mode, config_file_name, experiment_id=None): '''Run nni manager process''' - global CONFIG_FILE_NAME - print_normal('Checking environment...') + nni_config = Config(config_file_name) if detect_port(port): print_error('Port %s is used by another process, please reset the port!' % port) exit(1) @@ -52,8 +49,9 @@ def start_rest_server(port, platform, mode, experiment_id=None): cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode] if mode == 'resume': cmds += ['--experiment_id', experiment_id] - stdout_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stdout') - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + print(cmds) + stdout_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stdout') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') stdout_file = open(stdout_full_path, 'a+') stderr_file = open(stderr_full_path, 'a+') time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) @@ -64,7 +62,7 @@ def start_rest_server(port, platform, mode, experiment_id=None): process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) return process, str(time_now) -def set_trial_config(experiment_config, port): +def set_trial_config(experiment_config, port, config_file_name): '''set trial configuration''' request_data = dict() value_dict = dict() @@ -87,16 +85,16 @@ def set_trial_config(experiment_config, port): return True else: print('Error message is {}'.format(response.text)) - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) return False -def set_local_config(experiment_config, port): +def set_local_config(experiment_config, port, config_file_name): '''set local configuration''' - return set_trial_config(experiment_config, port) + return set_trial_config(experiment_config, port, config_file_name) -def set_remote_config(experiment_config, port): +def set_remote_config(experiment_config, port, config_file_name): '''Call setClusterMetadata to pass trial''' #set machine_list request_data = dict() @@ -106,15 +104,15 @@ def set_remote_config(experiment_config, port): if not response or not check_response(response): if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message #set trial_config - return set_trial_config(experiment_config, port), err_message + return set_trial_config(experiment_config, port, config_file_name), err_message -def set_pai_config(experiment_config, port): +def set_pai_config(experiment_config, port, config_file_name): '''set pai configuration''' pai_config_data = dict() pai_config_data['pai_config'] = experiment_config['paiConfig'] @@ -123,15 +121,15 @@ def set_pai_config(experiment_config, port): if not response or not response.status_code == 200: if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message #set trial_config - return set_trial_config(experiment_config, port), err_message + return set_trial_config(experiment_config, port, config_file_name), err_message -def set_experiment(experiment_config, mode, port): +def set_experiment(experiment_config, mode, port, config_file_name): '''Call startExperiment (rest POST /experiment) with yaml file content''' request_data = dict() request_data['authorName'] = experiment_config['authorName'] @@ -189,18 +187,17 @@ def set_experiment(experiment_config, mode, port): if check_response(response): return response else: - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) print_error('Setting experiment error, error message is {}'.format(response.text)) return None -def launch_experiment(args, experiment_config, mode, experiment_id=None): +def launch_experiment(args, experiment_config, mode, config_file_name, experiment_id=None): '''follow steps to start rest server and start experiment''' - global CONFIG_FILE_NAME - nni_config = Config(CONFIG_FILE_NAME) + nni_config = Config(config_file_name) # start rest server - rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, experiment_id) + rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id) nni_config.set_config('restServerPid', rest_process.pid) # Deal with annotation if experiment_config.get('useAnnotation'): @@ -235,7 +232,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): # set remote config if experiment_config['trainingServicePlatform'] == 'remote': print_normal('Setting remote config...') - config_result, err_msg = set_remote_config(experiment_config, args.port) + config_result, err_msg = set_remote_config(experiment_config, args.port, config_file_name) if config_result: print_normal('Successfully set remote config!') else: @@ -250,7 +247,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): # set local config if experiment_config['trainingServicePlatform'] == 'local': print_normal('Setting local config...') - if set_local_config(experiment_config, args.port): + if set_local_config(experiment_config, args.port, config_file_name): print_normal('Successfully set local config!') else: print_error('Failed!') @@ -264,7 +261,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): #set pai config if experiment_config['trainingServicePlatform'] == 'pai': print_normal('Setting pai config...') - config_result, err_msg = set_pai_config(experiment_config, args.port) + config_result, err_msg = set_pai_config(experiment_config, args.port, config_file_name) if config_result: print_normal('Successfully set pai config!') else: @@ -279,7 +276,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): # start a new experiment print_normal('Starting experiment...') - response = set_experiment(experiment_config, mode, args.port) + response = set_experiment(experiment_config, mode, args.port, config_file_name) if response: if experiment_id is None: experiment_id = json.loads(response.text).get('experiment_id') @@ -292,33 +289,61 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): except Exception: raise Exception(ERROR_INFO % 'Restful server stopped!') exit(1) - web_ui_url_list = get_web_ui_urls(args.port) + web_ui_url_list = get_web_ui_urls(args.port, config_file_name) #save experiment information experiment_config = Experiments() - experiment_config.add_experiment(experiment_id, args.port, start_time, CONFIG_FILE_NAME) + experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) +def cmp_time(time1, time2): + '''compare the time''' + try: + time1 = time.strptime(time1,'%Y-%m-%d %H:%M:%S') + time2 = time.strptime(time2,'%Y-%m-%d %H:%M:%S') + return int(time1) - int(time2) + except: + return 0 + def resume_experiment(args): '''resume an experiment''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - if experiment_dict.get(args.id) is None: - print_error('Id not exist!') - exit(1) - if experiment_dict[args.id]['status'] == 'running': - print_error('Experiment %s is running!' % args.id) - exit(1) - nni_config = Config(experiment_dict[args.id]['fileName']) + experiment_id = None + experiment_endTime = None + #find the latest stopped experiment + if not args.id: + for key in experiment_dict.keys(): + if experiment_dict[key]['status'] == 'stopped': + if experiment_id is None: + experiment_id = key + experiment_endTime = experiment_dict[key]['endTime'] + else: + if cmp_time(experiment_dict[key]['endTime'], experiment_endTime) > 0: + experiment_id = key + experiment_endTime = experiment_dict[key]['endTime'] + if experiment_id is None: + print_error('There is no experiment stopped!') + exit(1) + else: + if experiment_dict.get(args.id) is None: + print_error('Id not exist!') + exit(1) + if experiment_dict[args.id]['status'] == 'running': + print_error('Experiment %s is running!' % args.id) + exit(1) + experiment_id = args.id + print_normal('Resuming experiment %s...' % experiment_id) + nni_config = Config(experiment_dict[experiment_id]['fileName']) experiment_config = nni_config.get_config('experimentConfig') experiment_id = nni_config.get_config('experimentId') - launch_experiment(args, experiment_config, 'resume', experiment_id) + launch_experiment(args, experiment_config, 'resume', experiment_dict[experiment_id]['fileName'], experiment_id) def create_experiment(args): '''start a new experiment''' - global CONFIG_FILE_NAME - nni_config = Config(CONFIG_FILE_NAME) + config_file_name = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + nni_config = Config(config_file_name) config_path = os.path.abspath(args.config) if not os.path.exists(config_path): print_error('Please set correct config path!') @@ -327,5 +352,5 @@ def create_experiment(args): validate_all_content(experiment_config, config_path) nni_config.set_config('experimentConfig', experiment_config) - launch_experiment(args, experiment_config, 'new') + launch_experiment(args, experiment_config, 'new', config_file_name) nni_config.set_config('restServerPort', args.port) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 4c17ebcaf0..da56a6a763 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -94,7 +94,7 @@ def parse_args(): parser_experiment_status.add_argument('id', nargs='?', help='the id of experiment') parser_experiment_status.set_defaults(func=experiment_status) parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') - parser_experiment_list.add_argument('--all', action='store_true', default=False, help='list all of experiments') + parser_experiment_list.add_argument('all', nargs='?', help='list all of experiments') parser_experiment_list.set_defaults(func=experiment_list) #TODO:finish webui function @@ -140,7 +140,7 @@ def parse_args(): parser_package_subparsers = parser_package.add_subparsers() parser_package_install = parser_package_subparsers.add_parser('install', help='install packages') parser_package_install.add_argument('--name', '-n', dest='name', help='package name to be installed') - parser_package_install.set_defaults(func=package_install) + parser_package_install.set_defaults(func=package_install) parser_package_show = parser_package_subparsers.add_parser('show', help='show the information of packages') parser_package_show.set_defaults(func=package_show) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 21ab6128b2..d4d99309fd 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -33,13 +33,6 @@ def check_experiment_id(args): '''check if the id is valid - 1.If there is an id specified, return the corresponding port - 2.If there is no id specified, and there is an experiment running, return it as default port, or return Error - 3.If the id matches an experiment, nnictl will return the id. - 4.If the id ends with *, nnictl will match all ids matchs the regular - 5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id - 6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information - 7.Users could use 'nnictl stop all' to stop all experiments ''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() @@ -59,6 +52,9 @@ def check_experiment_id(args): experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) + elif not running_experiment_list: + print_error('There is no experiment running!') + exit(1) else: return running_experiment_list[0] if experiment_dict.get(args.id): @@ -67,44 +63,15 @@ def check_experiment_id(args): print_error('Id not correct!') exit(1) -def get_config_filename(args): - '''get the file name of config file''' - experiment_id = check_experiment_id(args) - experiment_config = Experiments() - experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[experiment_id]['fileName'] - -def get_experiment_port(args): - '''get the port of experiment''' - experiment_id = check_experiment_id(args) - experiment_config = Experiments() - experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[experiment_id]['port'] - -def convert_time_stamp_to_date(content): - '''Convert time stamp to date time format''' - start_time_stamp = content.get('startTime') - end_time_stamp = content.get('endTime') - if start_time_stamp: - start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") - content['startTime'] = str(start_time) - if end_time_stamp: - end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") - content['endTime'] = str(end_time) - return content - -def check_rest(args): - '''check if restful server is running''' - nni_config = Config(get_config_filename(args)) - rest_port = nni_config.get_config('restServerPort') - running, _ = check_rest_server_quick(rest_port) - if not running: - print_normal('Restful server is running...') - else: - print_normal('Restful server is not running...') - def parse_ids(args): - '''Parse the arguments for nnictl stop''' + '''Parse the arguments for nnictl stop + 1.If there is an id specified, return the corresponding id + 2.If there is no id specified, and there is an experiment running, return the id, or return Error + 3.If the id matches an experiment, nnictl will return the id. + 4.If the id ends with *, nnictl will match all ids matchs the regular + 5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id + 6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information + ''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() if not experiment_dict: @@ -145,6 +112,42 @@ def parse_ids(args): print_error('There are no experiments matched, please check experiment id...') return result_list +def get_config_filename(args): + '''get the file name of config file''' + experiment_id = check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[experiment_id]['fileName'] + +def get_experiment_port(args): + '''get the port of experiment''' + experiment_id = check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[experiment_id]['port'] + +def convert_time_stamp_to_date(content): + '''Convert time stamp to date time format''' + start_time_stamp = content.get('startTime') + end_time_stamp = content.get('endTime') + if start_time_stamp: + start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") + content['startTime'] = str(start_time) + if end_time_stamp: + end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") + content['endTime'] = str(end_time) + return content + +def check_rest(args): + '''check if restful server is running''' + nni_config = Config(get_config_filename(args)) + rest_port = nni_config.get_config('restServerPort') + running, _ = check_rest_server_quick(rest_port) + if not running: + print_normal('Restful server is running...') + else: + print_normal('Restful server is not running...') + def stop_experiment(args): '''Stop the experiment which is running''' experiment_id_list = parse_ids(args) @@ -326,7 +329,7 @@ def experiment_list(args): print('There is no experiment running...') exit(1) experiment_id_list = [] - if args.all: + if args.all and args.all == 'all': for key in experiment_dict.keys(): experiment_id_list.append(key) else: @@ -334,7 +337,7 @@ def experiment_list(args): if experiment_dict[key]['status'] == 'running': experiment_id_list.append(key) if not experiment_id_list: - print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments!') + print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!') experiment_information = "" for key in experiment_id_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py index 89a5c2cf9d..69c374aebd 100644 --- a/tools/nnicmd/webui_utils.py +++ b/tools/nnicmd/webui_utils.py @@ -22,12 +22,12 @@ from socket import AddressFamily from .config_utils import Config -def get_web_ui_urls(port): +def get_web_ui_urls(port, CONFIG_FILE_NAME): webui_url_list = [] for name, info in psutil.net_if_addrs().items(): for addr in info: if AddressFamily.AF_INET == addr.family: webui_url_list.append('http://{}:{}'.format(addr.address, port)) - nni_config = Config(port) + nni_config = Config(CONFIG_FILE_NAME) nni_config.set_config('webuiUrl', webui_url_list) return webui_url_list From 2c68171069da5eb5f139b170238db13594032ef1 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 16:32:01 +0800 Subject: [PATCH 22/26] fix comment --- tools/nnicmd/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 676a2946c6..d0da1031ea 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -328,7 +328,7 @@ def resume_experiment(args): exit(1) else: if experiment_dict.get(args.id) is None: - print_error('Id not exist!') + print_error('Id %s not exist!' % args.id) exit(1) if experiment_dict[args.id]['status'] == 'running': print_error('Experiment %s is running!' % args.id) From 8d14ca9775825f1f8160f9924f9716942c471247 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:25:09 +0800 Subject: [PATCH 23/26] revert dockerfile --- deployment/Dockerfile | 60 +------------------------- deployment/Dockerfile.build.base | 72 ++++++++++++++++++++++++++++++++ docs/NNICTLDOC.md | 31 +++++++------- 3 files changed, 89 insertions(+), 74 deletions(-) create mode 100644 deployment/Dockerfile.build.base diff --git a/deployment/Dockerfile b/deployment/Dockerfile index 8ad2632402..c8686f9365 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -18,68 +18,10 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 +FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 LABEL maintainer='Microsoft NNI Team' -ENV HADOOP_VERSION=2.7.2 -LABEL HADOOP_VERSION=2.7.2 - -RUN DEBIAN_FRONTEND=noninteractive && \ - apt-get -y update && \ - apt-get -y install sudo \ - apt-utils \ - git \ - curl \ - vim \ - unzip \ - wget \ - build-essential \ - cmake \ - libopenblas-dev \ - automake \ - openjdk-8-jdk \ - openssh-client \ - openssh-server \ - lsof \ - python3.5 \ - python3-dev \ - python3-pip \ - python3-tk \ - libcupti-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# numpy 1.14.3 scipy 1.1.0 -RUN pip3 --no-cache-dir install \ - numpy==1.14.3 scipy==1.1.0 - -# -#Install hadoop -# -RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ - tar xz -C /usr/local && \ - mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop - -# -#Install NNI -# -RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 - -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ - HADOOP_INSTALL=/usr/local/hadoop \ - NVIDIA_VISIBLE_DEVICES=all - -ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ - HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ - HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ - HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ - HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ - HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" - -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ - LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server - # #Tensorflow 1.10.0 # diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base new file mode 100644 index 0000000000..9dffcfc428 --- /dev/null +++ b/deployment/Dockerfile.build.base @@ -0,0 +1,72 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + LABEL maintainer='Microsoft NNI Team' + ENV HADOOP_VERSION=2.7.2 +LABEL HADOOP_VERSION=2.7.2 + RUN DEBIAN_FRONTEND=noninteractive && \ + apt-get -y update && \ + apt-get -y install sudo \ + apt-utils \ + git \ + curl \ + vim \ + unzip \ + wget \ + build-essential \ + cmake \ + libopenblas-dev \ + automake \ + openjdk-8-jdk \ + openssh-client \ + openssh-server \ + lsof \ + python3.5 \ + python3-dev \ + python3-pip \ + python3-tk \ + libcupti-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # numpy 1.14.3 scipy 1.1.0 +RUN pip3 --no-cache-dir install \ + numpy==1.14.3 scipy==1.1.0 + # +#Install hadoop +# +RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ + tar xz -C /usr/local && \ + mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop + # +#Install NNI +# +RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 + ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_INSTALL=/usr/local/hadoop \ + NVIDIA_VISIBLE_DEVICES=all + ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ + HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ + HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ + HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ + HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ + HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" + ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ + LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server + WORKDIR /root \ No newline at end of file diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index c10086d6eb..705bbc1ef5 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,7 +49,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| True| |The id of the experiment you want to resume| + | id| False| |The id of the experiment you want to resume| | --port, -p| False| |Rest port of the experiment you want to resume| @@ -88,8 +88,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --filename, -f| True| |the file storing your new search space| - | --id, -i| False| |ID of the experiment you want to set| * __nnictl update concurrency__ * Description @@ -104,8 +104,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --value, -v| True| |the number of allowed concurrent trials| - | --id, -i| False| |ID of the experiment you want to set| * __nnictl update duration__ * Description @@ -120,8 +120,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| + | --value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| * __nnictl trial__ @@ -138,7 +138,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| * __nnictl trial kill__ * Description @@ -152,9 +152,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --trialid, -t| True| |ID of the trial you want to kill.| - | --id, -i| False| |ID of the experiment you want to set| - @@ -172,7 +171,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| * __nnictl experiment status__ @@ -187,7 +186,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| * __nnictl experiment list__ @@ -202,7 +201,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --all| False| False|Show all of experiments, including stopped experiments.| + | all| False| False|Show all of experiments, including stopped experiments.| @@ -230,10 +229,11 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --head, -h| False| |show head lines of stdout| | --tail, -t| False| |show tail lines of stdout| | --path, -p| False| |show the path of stdout file| - | --id, -i| False| |ID of the experiment you want to set| + * __nnictl log stderr__ * Description @@ -248,10 +248,11 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --head, -h| False| |show head lines of stderr| | --tail, -t| False| |show tail lines of stderr| | --path, -p| False| |show the path of stderr file| - | --id, -i| False| |ID of the experiment you want to set| + * __nnictl log trial__ * Description @@ -266,7 +267,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -I| False| |the id of trial| + | id| False| |the id of trial| ### Manage webui @@ -283,4 +284,4 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| \ No newline at end of file + | id| False| |ID of the experiment you want to set| \ No newline at end of file From aeb7c665357a87391cc1fe3687a02c96fd11d076 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:26:53 +0800 Subject: [PATCH 24/26] update --- deployment/Dockerfile | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/deployment/Dockerfile b/deployment/Dockerfile index c8686f9365..d0ddf99587 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -1,23 +1,3 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 LABEL maintainer='Microsoft NNI Team' @@ -32,4 +12,4 @@ RUN pip3 --no-cache-dir install tensorflow-gpu==1.10.0 # RUN pip3 --no-cache-dir install Keras==2.1.6 -WORKDIR /root +WORKDIR /root \ No newline at end of file From a25654984f0d3bb76346af2ef4bf9dc36f3f00f0 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:29:08 +0800 Subject: [PATCH 25/26] update --- deployment/Dockerfile.build.base | 155 +++++++++++++++++-------------- deployment/README.md | 6 +- 2 files changed, 87 insertions(+), 74 deletions(-) diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base index 9dffcfc428..56315a3b5f 100644 --- a/deployment/Dockerfile.build.base +++ b/deployment/Dockerfile.build.base @@ -1,72 +1,83 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - LABEL maintainer='Microsoft NNI Team' - ENV HADOOP_VERSION=2.7.2 -LABEL HADOOP_VERSION=2.7.2 - RUN DEBIAN_FRONTEND=noninteractive && \ - apt-get -y update && \ - apt-get -y install sudo \ - apt-utils \ - git \ - curl \ - vim \ - unzip \ - wget \ - build-essential \ - cmake \ - libopenblas-dev \ - automake \ - openjdk-8-jdk \ - openssh-client \ - openssh-server \ - lsof \ - python3.5 \ - python3-dev \ - python3-pip \ - python3-tk \ - libcupti-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - # numpy 1.14.3 scipy 1.1.0 -RUN pip3 --no-cache-dir install \ - numpy==1.14.3 scipy==1.1.0 - # -#Install hadoop -# -RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ - tar xz -C /usr/local && \ - mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop - # -#Install NNI -# -RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 - ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ - HADOOP_INSTALL=/usr/local/hadoop \ - NVIDIA_VISIBLE_DEVICES=all - ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ - HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ - HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ - HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ - HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ - HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" - ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ - LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server - WORKDIR /root \ No newline at end of file +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +LABEL maintainer='Microsoft NNI Team' + +ENV HADOOP_VERSION=2.7.2 +LABEL HADOOP_VERSION=2.7.2 + +RUN DEBIAN_FRONTEND=noninteractive && \ + apt-get -y update && \ + apt-get -y install sudo \ + apt-utils \ + git \ + curl \ + vim \ + unzip \ + wget \ + build-essential \ + cmake \ + libopenblas-dev \ + automake \ + openjdk-8-jdk \ + openssh-client \ + openssh-server \ + lsof \ + python3.5 \ + python3-dev \ + python3-pip \ + python3-tk \ + libcupti-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# numpy 1.14.3 scipy 1.1.0 +RUN pip3 --no-cache-dir install \ + numpy==1.14.3 scipy==1.1.0 + +# +#Install hadoop +# +RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ + tar xz -C /usr/local && \ + mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop + +# +#Install NNI +# +RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_INSTALL=/usr/local/hadoop \ + NVIDIA_VISIBLE_DEVICES=all + +ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ + HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ + HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ + HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ + HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ + HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" + +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ + LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server + +WORKDIR /root diff --git a/deployment/README.md b/deployment/README.md index c9bd2e8175..19b84cba3f 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,7 +2,8 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. -Dockerfile could build docker image, users could build their customized docker image using this file. +Dockerfile.build.base could build the base Docker image, users can get a docker image with Ubuntu and NNI environment after building this file. +Dockerfile could build the customized docker image, users could build their customized docker image using this file. ## 2.Including Libraries ``` @@ -16,5 +17,6 @@ NNI v0.1 ## 3 How to run + docker build -f Dockerfile.build.base -t nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 . docker build -t nni/nni . - nvidia-docker run -it nni/nni \ No newline at end of file + nvidia-docker run -it nni/nni \ No newline at end of file From 3fb0bca2e6c3429ec1313619100cf0fbeda3738d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:33:27 +0800 Subject: [PATCH 26/26] update --- tools/nnicmd/launcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index d0da1031ea..519a82383e 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -49,7 +49,6 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode] if mode == 'resume': cmds += ['--experiment_id', experiment_id] - print(cmds) stdout_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stdout') stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') stdout_file = open(stdout_full_path, 'a+')