From 3d1e4e9e2b0c3190e80adf6bd36f921aaa1783ff Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 14 Sep 2018 16:40:27 +0800 Subject: [PATCH 01/34] fix nnictl bug --- tools/nnicmd/nnictl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 73b2950a55..9dd9f8dfa9 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -29,7 +29,7 @@ def nni_help_info(*args): def parse_args(): '''Definite the arguments users need to follow and input''' - parser = argparse.ArgumentParser(prog='nni ctl', description='use nni control') + parser = argparse.ArgumentParser(prog='nnictl', description='use nnictl command to control nni experiments') parser.set_defaults(func=nni_help_info) # create subparsers for args with sub values From 2b01089f26bbde224b5c5aac7b8448d4d84ed975 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Sun, 30 Sep 2018 11:20:29 +0800 Subject: [PATCH 02/34] fix install.sh --- install.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/install.sh b/install.sh index 0c3d39bcf1..3d5199e187 100644 --- a/install.sh +++ b/install.sh @@ -1,7 +1,3 @@ #!/bin/bash -make install-dependencies -make build -make dev-install -make install-examples -make update-bash-config +make easy-install source ~/.bashrc From 346badd0fbe737870200d182847436b65c3d3a7f Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 10 Oct 2018 18:19:19 +0800 Subject: [PATCH 03/34] add desc for Dockerfile.build.base --- deployment/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deployment/README.md b/deployment/README.md index b19ff06260..7da20075ec 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,6 +2,8 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. +Dockerfile.build.base is the base Docker, including Ubuntu, cudnn and the NNI environment. +Dockerfile is the customized docker for users, if you want to add your own deeplearning environment, you could update this Dockerfile. ## 2.Including Libraries ``` From 46a8350883153a1ec98d438ff36c5182f872f7e0 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 11 Oct 2018 17:19:27 +0800 Subject: [PATCH 04/34] update document for Dockerfile --- deployment/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deployment/README.md b/deployment/README.md index 7da20075ec..19b84cba3f 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,8 +2,8 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. -Dockerfile.build.base is the base Docker, including Ubuntu, cudnn and the NNI environment. -Dockerfile is the customized docker for users, if you want to add your own deeplearning environment, you could update this Dockerfile. +Dockerfile.build.base could build the base Docker image, users can get a docker image with Ubuntu and NNI environment after building this file. +Dockerfile could build the customized docker image, users could build their customized docker image using this file. ## 2.Including Libraries ``` From a8708174b7ec69bd1297e19a2f9c4a4cf6372478 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 16 Oct 2018 15:36:47 +0800 Subject: [PATCH 05/34] update --- tools/nnicmd/common_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/common_utils.py b/tools/nnicmd/common_utils.py index ab5f5b11c9..164e5743d6 100644 --- a/tools/nnicmd/common_utils.py +++ b/tools/nnicmd/common_utils.py @@ -67,7 +67,7 @@ def detect_port(port): socket_test = socket.socket(socket.AF_INET,socket.SOCK_STREAM) try: socket_test.connect(('127.0.0.1', int(port))) - socket_test.shutdown(2) + socket_test.close() return True except: return False From b45268cff7ffd58c7c4ca25550f5e44cfb13ab45 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 16 Oct 2018 15:50:04 +0800 Subject: [PATCH 06/34] refactor port detect --- tools/nnicmd/common_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/nnicmd/common_utils.py b/tools/nnicmd/common_utils.py index 164e5743d6..0ffd8eb8f9 100644 --- a/tools/nnicmd/common_utils.py +++ b/tools/nnicmd/common_utils.py @@ -70,4 +70,8 @@ def detect_port(port): socket_test.close() return True except: + try: + socket_test.close() + except: + return False return False From 59626ecee84804aff19dc97b5a56f2b0b2230301 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 16 Oct 2018 16:03:36 +0800 Subject: [PATCH 07/34] update --- tools/nnicmd/common_utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/nnicmd/common_utils.py b/tools/nnicmd/common_utils.py index 0ffd8eb8f9..164e5743d6 100644 --- a/tools/nnicmd/common_utils.py +++ b/tools/nnicmd/common_utils.py @@ -70,8 +70,4 @@ def detect_port(port): socket_test.close() return True except: - try: - socket_test.close() - except: - return False return False From 2ca84c5e5d378905d1bedabd8864676b0d42150d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 17 Oct 2018 12:29:40 +0800 Subject: [PATCH 08/34] refactor NNICTLDOC.md --- docs/NNICTLDOC.md | 78 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 5e269b950e..8139f5b8c4 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -14,6 +14,7 @@ nnictl trial nnictl experiment nnictl config nnictl log +nnictl webui ``` ### Manage an experiment * __nnictl create__ @@ -33,7 +34,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --config, -c| True| |yaml configure file of the experiment| - + | --port, -p | False| |the port of restful server| * __nnictl resume__ @@ -56,11 +57,20 @@ nnictl log * __nnictl stop__ * Description - You can use this command to stop a running experiment. + You can use this command to stop a running experiment or multiple experiments. * Usage - nnictl stop + nnictl stop [id] + + * Detail + + 1.If there is an id specified, and the id matches the running experiment, nnictl will stop the corresponding experiment, or will print error message. + 2.If there is no id specified, and there is an experiment running, stop the running experiment, or print error message. + 3.If the id ends with *, nnictl will stop all experiments whose ids matchs the regular. + 4.If the id does not exist but match the prefix of an experiment id, nnictl will stop the matched experiment. + 5.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information. + 6.Users could use 'nnictl stop all' to stop all experiments * __nnictl update__ @@ -78,6 +88,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --filename, -f| True| |the file storing your new search space| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl update concurrency__ * Description @@ -93,6 +104,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --value, -v| True| |the number of allowed concurrent trials| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl update duration__ * Description @@ -108,6 +120,7 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl trial__ @@ -120,6 +133,12 @@ nnictl log nnictl trial ls + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| + * __nnictl trial kill__ * Description @@ -132,7 +151,8 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --trialid, -t| True| |ID of the trial you want to kill.| + | --trialid, -t| True| |ID of the trial you want to kill.| + | --id, -i| False| |ID of the experiment you want to set| @@ -146,6 +166,36 @@ nnictl log * Usage nnictl experiment show + + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| + + +* __nnictl experiment status__ + * Description + + Show the status of experiment. + * Usage + + nnictl experiment status + + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| + + +* __nnictl experiment list__ + * Description + + Show the id and start time of all running experiments. + * Usage + + nnictl experiment list @@ -176,6 +226,7 @@ nnictl log | --head, -h| False| |show head lines of stdout| | --tail, -t| False| |show tail lines of stdout| | --path, -p| False| |show the path of stdout file| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl log stderr__ * Description @@ -193,6 +244,7 @@ nnictl log | --head, -h| False| |show head lines of stderr| | --tail, -t| False| |show tail lines of stderr| | --path, -p| False| |show the path of stderr file| + | --id, -i| False| |ID of the experiment you want to set| * __nnictl log trial__ * Description @@ -208,4 +260,20 @@ nnictl log | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --id, -I| False| |the id of trial| - \ No newline at end of file + + +### Manage webui +* __nnictl webui url__ + * Description + + Show the urls of the experiment. + + * Usage + + nnictl webui url + + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --id, -i| False| |ID of the experiment you want to set| \ No newline at end of file From ab02c93c864ff5545d50aff15cacf676948ab8ce Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 17 Oct 2018 16:11:56 +0800 Subject: [PATCH 09/34] add document for pai and nnictl --- docs/ExperimentConfig.md | 16 +++++++++------- docs/GetStarted.md | 2 +- docs/RemoteMachineMode.md | 15 +++++++++------ 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/docs/ExperimentConfig.md b/docs/ExperimentConfig.md index 8f31129654..74fc121f3f 100644 --- a/docs/ExperimentConfig.md +++ b/docs/ExperimentConfig.md @@ -12,7 +12,7 @@ experimentName: trialConcurrency: maxExecDuration: maxTrialNum: -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: searchSpacePath: #choice: true, false @@ -42,7 +42,7 @@ experimentName: trialConcurrency: maxExecDuration: maxTrialNum: -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: searchSpacePath: #choice: true, false @@ -79,7 +79,7 @@ experimentName: trialConcurrency: maxExecDuration: maxTrialNum: -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: #choice: true, false useAnnotation: @@ -145,6 +145,8 @@ machineList: * __local__ mode means you run an experiment in your local linux machine. * __remote__ mode means you submit trial jobs to remote linux machines. If you set platform as remote, you should complete __machineList__ field. + + * __pai__ mode means you submit trial jobs to [OpenPai](https://github.com/Microsoft/pai) of Microsoft. For more details of pai configuration, please reference [PAIMOdeDoc](./PAIMode.md) * __searchSpacePath__ * Description @@ -268,7 +270,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local #choice: true, false useAnnotation: true @@ -292,7 +294,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: /nni/search_space.json #choice: true, false @@ -324,7 +326,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: local searchSpacePath: /nni/search_space.json #choice: true, false @@ -360,7 +362,7 @@ experimentName: test_experiment trialConcurrency: 3 maxExecDuration: 1h maxTrialNum: 10 -#choice: local, remote +#choice: local, remote, pai trainingServicePlatform: remote searchSpacePath: /nni/search_space.json #choice: true, false diff --git a/docs/GetStarted.md b/docs/GetStarted.md index 42366eb1e8..a98efcda7f 100644 --- a/docs/GetStarted.md +++ b/docs/GetStarted.md @@ -62,7 +62,7 @@ maxExecDuration: 3h # empty means never stop maxTrialNum: 100 -# choice: local, remote +# choice: local, remote, pai trainingServicePlatform: local # choice: true, false diff --git a/docs/RemoteMachineMode.md b/docs/RemoteMachineMode.md index 94f393324d..e7bf77888b 100644 --- a/docs/RemoteMachineMode.md +++ b/docs/RemoteMachineMode.md @@ -2,11 +2,11 @@ === NNI supports running an experiment on multiple machines, called remote machine mode. Let's say you have multiple machines with the account `bob` (Note: the account is not necessarily the same on multiple machines): -| IP | Username| Password | -| -------- |---------|-------| -| 10.1.1.1 | bob | bob123 | -| 10.1.1.2 | bob | bob123 | -| 10.1.1.3 | bob | bob123 | +| IP | Username| Password | Port | +| -------- |---------|-------|-------| +| 10.1.1.1 | bob | bob123 |22| +| 10.1.1.2 | bob | bob123 |22| +| 10.1.1.3 | bob | bob123 |22| ## Setup environment Install NNI on each of your machines following the install guide [here](GetStarted.md). @@ -34,7 +34,7 @@ trialConcurrency: 2 maxExecDuration: 3h # empty means never stop maxTrialNum: 100 -# choice: local, remote +# choice: local, remote, pai trainingServicePlatform: local # choice: true, false useAnnotation: true @@ -51,12 +51,15 @@ machineList: - ip: 10.1.1.1 username: bob passwd: bob123 + port: 22 - ip: 10.1.1.2 username: bob passwd: bob123 + port: 22 - ip: 10.1.1.3 username: bob passwd: bob123 + port: 22 ``` Simply filling the `machineList` section. This yaml file is named `exp_remote.yaml`, then run: ``` From 5ff7b4574e013bf2040e560fbd5a321f78fc946b Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 17 Oct 2018 20:20:30 +0800 Subject: [PATCH 10/34] add default value for port --- docs/RemoteMachineMode.md | 13 +++++-------- tools/nnicmd/config_schema.py | 4 ++-- tools/nnicmd/launcher_utils.py | 5 +++++ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/RemoteMachineMode.md b/docs/RemoteMachineMode.md index e7bf77888b..8c4d90ac3d 100644 --- a/docs/RemoteMachineMode.md +++ b/docs/RemoteMachineMode.md @@ -2,11 +2,11 @@ === NNI supports running an experiment on multiple machines, called remote machine mode. Let's say you have multiple machines with the account `bob` (Note: the account is not necessarily the same on multiple machines): -| IP | Username| Password | Port | -| -------- |---------|-------|-------| -| 10.1.1.1 | bob | bob123 |22| -| 10.1.1.2 | bob | bob123 |22| -| 10.1.1.3 | bob | bob123 |22| +| IP | Username| Password | +| -------- |---------|-------| +| 10.1.1.1 | bob | bob123 | +| 10.1.1.2 | bob | bob123 | +| 10.1.1.3 | bob | bob123 | ## Setup environment Install NNI on each of your machines following the install guide [here](GetStarted.md). @@ -51,15 +51,12 @@ machineList: - ip: 10.1.1.1 username: bob passwd: bob123 - port: 22 - ip: 10.1.1.2 username: bob passwd: bob123 - port: 22 - ip: 10.1.1.3 username: bob passwd: bob123 - port: 22 ``` Simply filling the `machineList` section. This yaml file is named `exp_remote.yaml`, then run: ``` diff --git a/tools/nnicmd/config_schema.py b/tools/nnicmd/config_schema.py index ace9621a5a..129f3392de 100644 --- a/tools/nnicmd/config_schema.py +++ b/tools/nnicmd/config_schema.py @@ -92,12 +92,12 @@ machine_list_schima = { Optional('machineList'):[Or({ 'ip': str, - 'port': And(int, lambda x: 0 < x < 65535), + Optional('port'): And(int, lambda x: 0 < x < 65535), 'username': str, 'passwd': str },{ 'ip': str, - 'port': And(int, lambda x: 0 < x < 65535), + Optional('port'): And(int, lambda x: 0 < x < 65535), 'username': str, 'sshKeyPath': os.path.exists, Optional('passphrase'): str diff --git a/tools/nnicmd/launcher_utils.py b/tools/nnicmd/launcher_utils.py index 30c9cea13e..7c811610e9 100644 --- a/tools/nnicmd/launcher_utils.py +++ b/tools/nnicmd/launcher_utils.py @@ -97,6 +97,11 @@ def validate_common_content(experiment_config): experiment_config['maxExecDuration'] = '999d' if experiment_config.get('maxTrialNum') is None: experiment_config['maxTrialNum'] = 99999 + if experiment_config['trainingServicePlatform'] == 'remote': + for index in range(len(experiment_config['machineList'])): + if experiment_config['machineList'][index].get('port') is None: + experiment_config['machineList'][index]['port'] = 22 + except Exception as exception: raise Exception(exception) From 5ae146d7e6ab597e5c43a690c491a14e452be734 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 18 Oct 2018 10:40:48 +0800 Subject: [PATCH 11/34] add exception handling in trial_keeper.py --- tools/trial_tool/trial_keeper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/trial_tool/trial_keeper.py b/tools/trial_tool/trial_keeper.py index 675a0566ac..ab1b42ac64 100644 --- a/tools/trial_tool/trial_keeper.py +++ b/tools/trial_tool/trial_keeper.py @@ -54,8 +54,8 @@ def main_loop(args): print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) #copy local directory to hdfs nni_local_output_dir = os.environ['NNI_OUTPUT_DIR'] - hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) try: + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name, timeout=5) if copyDirectoryToHdfs(nni_local_output_dir, args.pai_hdfs_output_dir, hdfs_client): print('copy directory from {0} to {1} success!'.format(nni_local_output_dir, args.pai_hdfs_output_dir)) else: From 1dde461f882975860dff6c54bf6b264c9929733e Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 18 Oct 2018 16:12:19 +0800 Subject: [PATCH 12/34] fix port bug --- tools/nnicmd/nnictl_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index a400170cb3..0aa31cf635 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -54,7 +54,7 @@ def get_experiment_port(args): if not args.id: return list(experiment_dict.values())[0][0] if experiment_dict.get(args.id): - return experiment_dict[args.id] + return experiment_dict[args.id][0] else: print_error('Id not correct!') return None From 9fdf6d451219f535a7326b4533be20ddc0d4f35f Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 16:51:03 +0800 Subject: [PATCH 13/34] fix resume --- tools/nnicmd/launcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index c9da0a4518..d2df91b660 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -305,7 +305,10 @@ def resume_experiment(args): '''resume an experiment''' nni_config = Config(args.port) experiment_config = nni_config.get_config('experimentConfig') - experiment_id = nni_config.get_config('experimentId') + if args.id: + experiment_id = args.id + else: + experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) def create_experiment(args): From c1285f88b8797b5d4f9f4f128e5953bf84024682 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 17:18:09 +0800 Subject: [PATCH 14/34] fix nnictl resume and fix nnictl stop --- deployment/Dockerfile | 82 ++++++++++++++++++++++++++++++- deployment/Dockerfile.build.base | 83 -------------------------------- deployment/README.md | 6 +-- tools/nnicmd/launcher.py | 11 +++-- tools/nnicmd/nnictl.py | 4 +- tools/nnicmd/nnictl_utils.py | 2 +- 6 files changed, 90 insertions(+), 98 deletions(-) delete mode 100644 deployment/Dockerfile.build.base diff --git a/deployment/Dockerfile b/deployment/Dockerfile index d0ddf99587..8ad2632402 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -1,7 +1,85 @@ -FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 LABEL maintainer='Microsoft NNI Team' +ENV HADOOP_VERSION=2.7.2 +LABEL HADOOP_VERSION=2.7.2 + +RUN DEBIAN_FRONTEND=noninteractive && \ + apt-get -y update && \ + apt-get -y install sudo \ + apt-utils \ + git \ + curl \ + vim \ + unzip \ + wget \ + build-essential \ + cmake \ + libopenblas-dev \ + automake \ + openjdk-8-jdk \ + openssh-client \ + openssh-server \ + lsof \ + python3.5 \ + python3-dev \ + python3-pip \ + python3-tk \ + libcupti-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# numpy 1.14.3 scipy 1.1.0 +RUN pip3 --no-cache-dir install \ + numpy==1.14.3 scipy==1.1.0 + +# +#Install hadoop +# +RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ + tar xz -C /usr/local && \ + mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop + +# +#Install NNI +# +RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_INSTALL=/usr/local/hadoop \ + NVIDIA_VISIBLE_DEVICES=all + +ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ + HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ + HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ + HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ + HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ + HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" + +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ + LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server + # #Tensorflow 1.10.0 # @@ -12,4 +90,4 @@ RUN pip3 --no-cache-dir install tensorflow-gpu==1.10.0 # RUN pip3 --no-cache-dir install Keras==2.1.6 -WORKDIR /root \ No newline at end of file +WORKDIR /root diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base deleted file mode 100644 index 56315a3b5f..0000000000 --- a/deployment/Dockerfile.build.base +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - -LABEL maintainer='Microsoft NNI Team' - -ENV HADOOP_VERSION=2.7.2 -LABEL HADOOP_VERSION=2.7.2 - -RUN DEBIAN_FRONTEND=noninteractive && \ - apt-get -y update && \ - apt-get -y install sudo \ - apt-utils \ - git \ - curl \ - vim \ - unzip \ - wget \ - build-essential \ - cmake \ - libopenblas-dev \ - automake \ - openjdk-8-jdk \ - openssh-client \ - openssh-server \ - lsof \ - python3.5 \ - python3-dev \ - python3-pip \ - python3-tk \ - libcupti-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# numpy 1.14.3 scipy 1.1.0 -RUN pip3 --no-cache-dir install \ - numpy==1.14.3 scipy==1.1.0 - -# -#Install hadoop -# -RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ - tar xz -C /usr/local && \ - mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop - -# -#Install NNI -# -RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 - -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ - HADOOP_INSTALL=/usr/local/hadoop \ - NVIDIA_VISIBLE_DEVICES=all - -ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ - HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ - HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ - HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ - HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ - HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" - -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ - LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server - -WORKDIR /root diff --git a/deployment/README.md b/deployment/README.md index 19b84cba3f..c9bd2e8175 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,8 +2,7 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. -Dockerfile.build.base could build the base Docker image, users can get a docker image with Ubuntu and NNI environment after building this file. -Dockerfile could build the customized docker image, users could build their customized docker image using this file. +Dockerfile could build docker image, users could build their customized docker image using this file. ## 2.Including Libraries ``` @@ -17,6 +16,5 @@ NNI v0.1 ## 3 How to run - docker build -f Dockerfile.build.base -t nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 . docker build -t nni/nni . - nvidia-docker run -it nni/nni \ No newline at end of file + nvidia-docker run -it nni/nni \ No newline at end of file diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index d2df91b660..208e48dc46 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -34,6 +34,7 @@ from .constants import * from .webui_utils import * import time +from .nnictl_utils import get_experiment_port def start_rest_server(port, platform, mode, experiment_id=None): '''Run nni manager process''' @@ -303,12 +304,12 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): def resume_experiment(args): '''resume an experiment''' - nni_config = Config(args.port) + port = get_experiment_port(args) + if port is None: + return None + nni_config = Config(port) experiment_config = nni_config.get_config('experimentConfig') - if args.id: - experiment_id = args.id - else: - experiment_id = nni_config.get_config('experimentId') + experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) def create_experiment(args): diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index a82891247f..d394dc51ca 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,9 +45,7 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--experiment', '-e', dest='id', help='ID of the experiment you want to resume') - parser_resume.add_argument('--manager', '-m', default='nnimanager', dest='manager') - parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') + parser_resume.add_argument('--id', '-i', dest='id', help='ID of the experiment you want to resume') parser_resume.set_defaults(func=resume_experiment) # parse update command diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 0aa31cf635..a852b8f86c 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -137,6 +137,7 @@ def stop_experiment(args): rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): print_normal('Experiment is not running...') + experiment_config.remove_experiment(experiment_id) return running, _ = check_rest_server_quick(rest_port) stop_rest_result = True @@ -153,7 +154,6 @@ def stop_experiment(args): call(cmds) if stop_rest_result: print_normal('Stop experiment success!') - experiment_config = Experiments() experiment_config.remove_experiment(experiment_id) def trial_ls(args): From af0d081401b2fb9ba57b9e0bb5568a01741f9480 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 17:24:30 +0800 Subject: [PATCH 15/34] fix document --- docs/NNICTLDOC.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 8139f5b8c4..d123c69aff 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,7 +49,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --experiment, -e| False| |ID of the experiment you want to resume| + | --id, -i| False| |ID of the experiment you want to resume| From 7ce8fd88ca9fb994187a5815cc37a8031b842be1 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 17:37:29 +0800 Subject: [PATCH 16/34] update --- docs/NNICTLDOC.md | 2 +- tools/nnicmd/launcher.py | 6 +----- tools/nnicmd/nnictl.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index d123c69aff..6a54b72a6e 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,7 +49,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to resume| + | --port, -p| False| |Rest port of the experiment you want to resume| diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 208e48dc46..c9da0a4518 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -34,7 +34,6 @@ from .constants import * from .webui_utils import * import time -from .nnictl_utils import get_experiment_port def start_rest_server(port, platform, mode, experiment_id=None): '''Run nni manager process''' @@ -304,10 +303,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): def resume_experiment(args): '''resume an experiment''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(args.port) experiment_config = nni_config.get_config('experimentConfig') experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index d394dc51ca..27a301b8d9 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,7 +45,7 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--id', '-i', dest='id', help='ID of the experiment you want to resume') + parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='Rest port of the experiment you want to resume') parser_resume.set_defaults(func=resume_experiment) # parse update command From b29aaed48c863ffe4db7ea316d933f3c173ad31d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 21:02:25 +0800 Subject: [PATCH 17/34] refactor nnictl --- docs/NNICTLDOC.md | 9 +- tools/nnicmd/config_utils.py | 17 +++- tools/nnicmd/constants.py | 4 +- tools/nnicmd/launcher.py | 42 +++++---- tools/nnicmd/nnictl.py | 6 +- tools/nnicmd/nnictl_utils.py | 177 +++++++++++++++++++---------------- tools/nnicmd/updater.py | 5 +- 7 files changed, 152 insertions(+), 108 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 6a54b72a6e..ee47e203c9 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -50,6 +50,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | --port, -p| False| |Rest port of the experiment you want to resume| + | --id, -i| True| |The id of the experiment you want to resume| @@ -192,11 +193,17 @@ nnictl webui * __nnictl experiment list__ * Description - Show the id and start time of all running experiments. + Show the information of all running experiments. * Usage nnictl experiment list + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | --all| False| False|Show all of experiments, including stopped experiments.| + * __nnictl config show__ diff --git a/tools/nnicmd/config_utils.py b/tools/nnicmd/config_utils.py index 9e1fb7ae91..b84e29ebbf 100644 --- a/tools/nnicmd/config_utils.py +++ b/tools/nnicmd/config_utils.py @@ -73,11 +73,24 @@ def __init__(self): self.experiment_file = os.path.join(NNICTL_HOME_DIR, '.experiment') self.experiments = self.read_file() - def add_experiment(self, id, port, time): + def add_experiment(self, id, port, time, file_name): '''set {key:value} paris to self.experiment''' - self.experiments[id] = [port, time] + self.experiments[id] = {} + self.experiments[id]['port'] = port + self.experiments[id]['startTime'] = time + self.experiments[id]['endTime'] = 'N/A' + self.experiments[id]['status'] = 'running' + self.experiments[id]['fileName'] = file_name self.write_file() + def update_experiment(self, id, key, value): + '''Update experiment''' + if id not in self.experiments: + return False + self.experiments[id][key] = value + self.write_file() + return True + def remove_experiment(self, id): '''remove an experiment by id''' if id in self.experiments: diff --git a/tools/nnicmd/constants.py b/tools/nnicmd/constants.py index 71c3d2112c..fec3b47b24 100644 --- a/tools/nnicmd/constants.py +++ b/tools/nnicmd/constants.py @@ -54,11 +54,13 @@ EXPERIMENT_START_FAILED_INFO = 'There is an experiment running in the port %d, please stop it first or set another port!\n' \ 'You could use \'nnictl stop --port [PORT]\' command to stop an experiment!\nOr you could use \'nnictl create --config [CONFIG_PATH] --port [PORT]\' to set port!\n' -EXPERIMENT_ID_INFO = '-----------------------------------------------------------------------\n' \ +EXPERIMENT_INFORMATION_FORMAT = '-----------------------------------------------------------------------\n' \ ' Experiment information\n' \ '%s\n' \ '-----------------------------------------------------------------------\n' +EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s StartTime: %s EndTime: %s \n' + PACKAGE_REQUIREMENTS = { 'SMAC': 'smac_tuner' } diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index c9da0a4518..645caaa60f 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -34,17 +34,15 @@ from .constants import * from .webui_utils import * import time +import random +import string + +CONFIG_FILE_NAME = ''.join(random.sample(string.ascii_letters + string.digits, 8)) def start_rest_server(port, platform, mode, experiment_id=None): '''Run nni manager process''' + global CONFIG_FILE_NAME print_normal('Checking environment...') - nni_config = Config(port) - rest_port = nni_config.get_config('restServerPort') - running, _ = check_rest_server_quick(rest_port) - if rest_port and running: - print_error(EXPERIMENT_START_FAILED_INFO % port) - exit(1) - if detect_port(port): print_error('Port %s is used by another process, please reset the port!' % port) exit(1) @@ -54,8 +52,8 @@ def start_rest_server(port, platform, mode, experiment_id=None): cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode] if mode == 'resume': cmds += ['--experiment_id', experiment_id] - stdout_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stdout') - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stdout_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stdout') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') stdout_file = open(stdout_full_path, 'a+') stderr_file = open(stderr_full_path, 'a+') time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) @@ -89,7 +87,7 @@ def set_trial_config(experiment_config, port): return True else: print('Error message is {}'.format(response.text)) - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) return False @@ -108,7 +106,7 @@ def set_remote_config(experiment_config, port): if not response or not check_response(response): if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message @@ -125,7 +123,7 @@ def set_pai_config(experiment_config, port): if not response or not response.status_code == 200: if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message @@ -191,7 +189,7 @@ def set_experiment(experiment_config, mode, port): if check_response(response): return response else: - stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) print_error('Setting experiment error, error message is {}'.format(response.text)) @@ -199,7 +197,8 @@ def set_experiment(experiment_config, mode, port): def launch_experiment(args, experiment_config, mode, experiment_id=None): '''follow steps to start rest server and start experiment''' - nni_config = Config(args.port) + global CONFIG_FILE_NAME + nni_config = Config(CONFIG_FILE_NAME) # start rest server rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, experiment_id) nni_config.set_config('restServerPid', rest_process.pid) @@ -297,20 +296,29 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): #save experiment information experiment_config = Experiments() - experiment_config.add_experiment(experiment_id, args.port, start_time) + experiment_config.add_experiment(experiment_id, args.port, start_time, CONFIG_FILE_NAME) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) def resume_experiment(args): '''resume an experiment''' - nni_config = Config(args.port) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + if experiment_dict.get(args.id) is None: + print_error('Id not exist!') + exit(1) + if experiment_dict[args.id]['status'] == 'running': + print_error('Experiment %s is running!' % args.id) + exit(1) + nni_config = Config(experiment_dict[args.id]['fileName']) experiment_config = nni_config.get_config('experimentConfig') experiment_id = nni_config.get_config('experimentId') launch_experiment(args, experiment_config, 'resume', experiment_id) def create_experiment(args): '''start a new experiment''' - nni_config = Config(args.port) + global CONFIG_FILE_NAME + nni_config = Config(CONFIG_FILE_NAME) config_path = os.path.abspath(args.config) if not os.path.exists(config_path): print_error('Please set correct config path!') diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 27a301b8d9..3328903866 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,7 +45,8 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='Rest port of the experiment you want to resume') + parser_resume.add_argument('--id', '-i', dest='id', required=True, help='The id of the experiment you want to resume') + parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_resume.set_defaults(func=resume_experiment) # parse update command @@ -93,7 +94,8 @@ def parse_args(): parser_experiment_status.add_argument('--id', '-i', dest='id', help='the id of experiment') parser_experiment_status.set_defaults(func=experiment_status) parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') - parser_experiment_list.set_defaults(func=experiment_id) + parser_experiment_list.add_argument('--all', action='store_true', default=False, help='list all of experiments') + parser_experiment_list.set_defaults(func=experiment_list) #TODO:finish webui function #parse board command diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index a852b8f86c..4931c1b954 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -22,42 +22,64 @@ import psutil import json import datetime +import time from subprocess import call, check_output from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .config_utils import Config, Experiments from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url -from .constants import NNICTL_HOME_DIR, EXPERIMENT_ID_INFO +from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT import time -from .common_utils import print_normal, print_error, detect_process +from .common_utils import print_normal, print_error, print_warning, detect_process -def get_experiment_port(args): - '''get the port of an experiment''' +def check_experiment_id(args): + '''check if the id is valid + 1.If there is an id specified, return the corresponding port + 2.If there is no id specified, and there is an experiment running, return it as default port, or return Error + 3.If the id matches an experiment, nnictl will return the id. + 4.If the id ends with *, nnictl will match all ids matchs the regular + 5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id + 6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information + 7.Users could use 'nnictl stop all' to stop all experiments + ''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - #1.If there is an id specified, return the corresponding port - #2.If there is no id specified, and there is an experiment running, return it as default port, or return Error - #3.If the id matches an experiment, nnictl will return the id. - #4.If the id ends with *, nnictl will match all ids matchs the regular - #5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id - #6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information - #7.Users could use 'nnictl stop all' to stop all experiments if not experiment_dict: - print_normal('Experiment is not running...') - return None - if not args.id and len(experiment_dict.keys()) > 1: - print_error('There are multiple experiments running, please set the experiment id...') - experiment_information = "" - for key in experiment_dict.keys(): - experiment_information += ('Id: ' + key + ' StartTime: ' + experiment_dict[key][1] + '\n') - print(EXPERIMENT_ID_INFO % experiment_information) - return None + print_normal('There is no experiment running...') + exit(1) if not args.id: - return list(experiment_dict.values())[0][0] + running_experiment_list = [] + for key in experiment_dict.keys(): + if experiment_dict[key]['status'] == 'running': + running_experiment_list.append(key) + if len(running_experiment_list) > 1: + print_error('There are multiple experiments running, please set the experiment id...') + experiment_information = "" + for key in running_experiment_list: + experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ + experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) + exit(1) + else: + return None if experiment_dict.get(args.id): - return experiment_dict[args.id][0] - else: - print_error('Id not correct!') return None + else: + print_error('Id not correct!') + exit(1) + +def get_config_filename(args): + '''get the file name of config file''' + check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[args.id]['fileName'] + +def get_experiment_port(args): + '''get the port of experiment''' + check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[args.id]['port'] def convert_time_stamp_to_date(content): '''Convert time stamp to date time format''' @@ -73,10 +95,7 @@ def convert_time_stamp_to_date(content): def check_rest(args): '''check if restful server is running''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') running, _ = check_rest_server_quick(rest_port) if not running: @@ -91,27 +110,32 @@ def parse_ids(args): if not experiment_dict: print_normal('Experiment is not running...') return None - experiment_id_list = list(experiment_dict.keys()) result_list = [] + running_experiment_list = [] + for key in experiment_dict.keys(): + if experiment_dict[key]['status'] == 'running': + running_experiment_list.append(key) if not args.id: - if len(experiment_id_list) > 1: + if len(running_experiment_list) > 1: print_error('There are multiple experiments running, please set the experiment id...') experiment_information = "" - for key in experiment_dict.keys(): - experiment_information += ('Id: ' + key + ' StartTime: ' + experiment_dict[key][1] + '\n') - print(EXPERIMENT_ID_INFO % experiment_information) - return None - result_list = experiment_id_list + for key in running_experiment_list: + experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ + experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) + exit(1) + else: + result_list = running_experiment_list elif args.id == 'all': - result_list = experiment_id_list + result_list = running_experiment_list elif args.id.endswith('*'): - for id in experiment_id_list: + for id in running_experiment_list: if id.startswith(args.id[:-1]): result_list.append(id) - elif args.id in experiment_id_list: + elif args.id in running_experiment_list: result_list.append(args.id) else: - for id in experiment_id_list: + for id in running_experiment_list: if id.startswith(args.id): result_list.append(id) if len(result_list) > 1: @@ -128,16 +152,13 @@ def stop_experiment(args): experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() for experiment_id in experiment_id_list: - port = experiment_dict.get(experiment_id)[0] - if port is None: - return None print_normal('Stoping experiment %s' % experiment_id) - nni_config = Config(port) + nni_config = Config(experiment_dict[experiment_id]['fileName']) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): print_normal('Experiment is not running...') - experiment_config.remove_experiment(experiment_id) + experiment_config.update_experiment(experiment_id, 'status', 'stopped') return running, _ = check_rest_server_quick(rest_port) stop_rest_result = True @@ -154,14 +175,13 @@ def stop_experiment(args): call(cmds) if stop_rest_result: print_normal('Stop experiment success!') - experiment_config.remove_experiment(experiment_id) + experiment_config.update_experiment(experiment_id, 'status', 'stopped') + time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) + experiment_config.update_experiment(experiment_id, 'endTime', str(time_now)) def trial_ls(args): '''List trial''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -182,10 +202,7 @@ def trial_ls(args): def trial_kill(args): '''List trial''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -203,10 +220,7 @@ def trial_kill(args): def list_experiment(args): '''Get experiment information''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -225,10 +239,7 @@ def list_experiment(args): def experiment_status(args): '''Show the status of experiment''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') result, response = check_rest_server_quick(rest_port) if not result: @@ -246,13 +257,11 @@ def get_log_content(file_name, cmds): def log_internal(args, filetype): '''internal function to call get_log_content''' - port = get_experiment_port(args) - if port is None: - return None + file_name = get_config_filename(args) if filetype == 'stdout': - file_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stdout') + file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stdout') else: - file_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr') + file_full_path = os.path.join(NNICTL_HOME_DIR, file_name, 'stderr') if args.head: get_log_content(file_full_path, ['head', '-' + str(args.head), file_full_path]) elif args.tail: @@ -273,10 +282,7 @@ def log_stderr(args): def log_trial(args): ''''get trial log path''' trial_id_path_dict = {} - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') rest_pid = nni_config.get_config('restServerPid') if not detect_process(rest_pid): @@ -304,28 +310,33 @@ def log_trial(args): def get_config(args): '''get config info''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) print(nni_config.get_all_config()) def webui_url(args): '''show the url of web ui''' - port = get_experiment_port(args) - if port is None: - return None - nni_config = Config(port) + nni_config = Config(get_config_filename(args)) print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl')))) -def experiment_id(args): - '''get the id of all experiments''' +def experiment_list(args): + '''get the information of all experiments''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() if not experiment_dict: print('There is no experiment running...') + exit(1) + experiment_id_list = [] + if args.all: + for key in experiment_dict.keys(): + experiment_id_list.append(key) else: - experiment_information = "" for key in experiment_dict.keys(): - experiment_information += ('Id: ' + key + ' StartTime: ' + experiment_dict[key][1] + '\n') - print(EXPERIMENT_ID_INFO % experiment_information) \ No newline at end of file + if experiment_dict[key]['status'] == 'running': + experiment_id_list.append(key) + if not experiment_id_list: + print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments!') + experiment_information = "" + for key in experiment_id_list: + experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ + experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) + print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) diff --git a/tools/nnicmd/updater.py b/tools/nnicmd/updater.py index 00291fc61f..d6e9bd15ad 100644 --- a/tools/nnicmd/updater.py +++ b/tools/nnicmd/updater.py @@ -25,6 +25,7 @@ from .url_utils import experiment_url from .config_utils import Config from .common_utils import get_json_content +from .nnictl_utils import check_experiment_id, get_experiment_port, get_config_filename def validate_digit(value, start, end): '''validate if a digit is valid''' @@ -56,7 +57,7 @@ def get_query_type(key): def update_experiment_profile(args, key, value): '''call restful server to update experiment profile''' - nni_config = Config(args.port) + nni_config = Config(get_config_filename(args)) rest_port = nni_config.get_config('restServerPort') running, _ = check_rest_server_quick(rest_port) if running: @@ -95,7 +96,7 @@ def update_duration(args): def update_trialnum(args): validate_digit(args.value, 1, 999999999) - if update_experiment_profile('maxTrialNum', int(args.value)): + if update_experiment_profile(args, 'maxTrialNum', int(args.value)): print('INFO: update %s success!' % 'trialnum') else: print('ERROR: update %s failed!' % 'trialnum') \ No newline at end of file From 683833bd3053a1e6790644c27a2b7ce6f3b7f41b Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 19 Oct 2018 21:19:58 +0800 Subject: [PATCH 18/34] update --- tools/nnicmd/nnictl_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 4931c1b954..21ab6128b2 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -60,26 +60,26 @@ def check_experiment_id(args): print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) else: - return None + return running_experiment_list[0] if experiment_dict.get(args.id): - return None + return args.id else: print_error('Id not correct!') exit(1) def get_config_filename(args): '''get the file name of config file''' - check_experiment_id(args) + experiment_id = check_experiment_id(args) experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[args.id]['fileName'] + return experiment_dict[experiment_id]['fileName'] def get_experiment_port(args): '''get the port of experiment''' - check_experiment_id(args) + experiment_id = check_experiment_id(args) experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[args.id]['port'] + return experiment_dict[experiment_id]['port'] def convert_time_stamp_to_date(content): '''Convert time stamp to date time format''' From 6149bf96fdaa446eb18235b74520642c911fc763 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 22 Oct 2018 10:49:56 +0800 Subject: [PATCH 19/34] update doc --- docs/NNICTLDOC.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index ee47e203c9..c10086d6eb 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,8 +49,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | --id, -i| True| |The id of the experiment you want to resume| | --port, -p| False| |Rest port of the experiment you want to resume| - | --id, -i| True| |The id of the experiment you want to resume| @@ -193,7 +193,7 @@ nnictl webui * __nnictl experiment list__ * Description - Show the information of all running experiments. + Show the information of all the (running) experiments. * Usage nnictl experiment list From 73bef2f7452b5aa1727e1301d65af2169aed5d9d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 22 Oct 2018 16:09:24 +0800 Subject: [PATCH 20/34] update --- tools/nnicmd/nnictl.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 3328903866..4c17ebcaf0 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -45,7 +45,7 @@ def parse_args(): # parse resume command parser_resume = subparsers.add_parser('resume', help='resume a new experiment') - parser_resume.add_argument('--id', '-i', dest='id', required=True, help='The id of the experiment you want to resume') + parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume') parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server') parser_resume.set_defaults(func=resume_experiment) @@ -54,15 +54,15 @@ def parse_args(): #add subparsers for parser_updater parser_updater_subparsers = parser_updater.add_subparsers() parser_updater_searchspace = parser_updater_subparsers.add_parser('searchspace', help='update searchspace') - parser_updater_searchspace.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_updater_searchspace.add_argument('id', nargs='?', help='the id of experiment') parser_updater_searchspace.add_argument('--filename', '-f', required=True) parser_updater_searchspace.set_defaults(func=update_searchspace) parser_updater_concurrency = parser_updater_subparsers.add_parser('concurrency', help='update concurrency') - parser_updater_concurrency.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_updater_concurrency.add_argument('id', nargs='?', help='the id of experiment') parser_updater_concurrency.add_argument('--value', '-v', required=True) parser_updater_concurrency.set_defaults(func=update_concurrency) parser_updater_duration = parser_updater_subparsers.add_parser('duration', help='update duration') - parser_updater_duration.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_updater_duration.add_argument('id', nargs='?', help='the id of experiment') parser_updater_duration.add_argument('--value', '-v', required=True) parser_updater_duration.set_defaults(func=update_duration) @@ -76,10 +76,10 @@ def parse_args(): #add subparsers for parser_trial parser_trial_subparsers = parser_trial.add_subparsers() parser_trial_ls = parser_trial_subparsers.add_parser('ls', help='list trial jobs') - parser_trial_ls.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_trial_ls.add_argument('id', nargs='?', help='the id of experiment') parser_trial_ls.set_defaults(func=trial_ls) parser_trial_kill = parser_trial_subparsers.add_parser('kill', help='kill trial jobs') - parser_trial_kill.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_trial_kill.add_argument('id', nargs='?', help='the id of experiment') parser_trial_kill.add_argument('--trialid', '-t', required=True, dest='trialid', help='the id of trial to be killed') parser_trial_kill.set_defaults(func=trial_kill) @@ -88,10 +88,10 @@ def parse_args(): #add subparsers for parser_experiment parser_experiment_subparsers = parser_experiment.add_subparsers() parser_experiment_show = parser_experiment_subparsers.add_parser('show', help='show the information of experiment') - parser_experiment_show.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_experiment_show.add_argument('id', nargs='?', help='the id of experiment') parser_experiment_show.set_defaults(func=list_experiment) parser_experiment_status = parser_experiment_subparsers.add_parser('status', help='show the status of experiment') - parser_experiment_status.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_experiment_status.add_argument('id', nargs='?', help='the id of experiment') parser_experiment_status.set_defaults(func=experiment_status) parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') parser_experiment_list.add_argument('--all', action='store_true', default=False, help='list all of experiments') @@ -103,14 +103,14 @@ def parse_args(): #add subparsers for parser_board parser_webui_subparsers = parser_webui.add_subparsers() parser_webui_url = parser_webui_subparsers.add_parser('url', help='show the url of web ui') - parser_webui_url.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_webui_url.add_argument('id', nargs='?', help='the id of experiment') parser_webui_url.set_defaults(func=webui_url) #parse config command parser_config = subparsers.add_parser('config', help='get config information') parser_config_subparsers = parser_config.add_subparsers() parser_config_show = parser_config_subparsers.add_parser('show', help='show the information of config') - parser_config_show.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_config_show.add_argument('id', nargs='?', help='the id of experiment') parser_config_show.set_defaults(func=get_config) #parse log command @@ -118,19 +118,19 @@ def parse_args(): # add subparsers for parser_log parser_log_subparsers = parser_log.add_subparsers() parser_log_stdout = parser_log_subparsers.add_parser('stdout', help='get stdout information') - parser_log_stdout.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_log_stdout.add_argument('id', nargs='?', help='the id of experiment') parser_log_stdout.add_argument('--tail', '-T', dest='tail', type=int, help='get tail -100 content of stdout') parser_log_stdout.add_argument('--head', '-H', dest='head', type=int, help='get head -100 content of stdout') parser_log_stdout.add_argument('--path', action='store_true', default=False, help='get the path of stdout file') parser_log_stdout.set_defaults(func=log_stdout) parser_log_stderr = parser_log_subparsers.add_parser('stderr', help='get stderr information') - parser_log_stderr.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_log_stderr.add_argument('id', nargs='?', help='the id of experiment') parser_log_stderr.add_argument('--tail', '-T', dest='tail', type=int, help='get tail -100 content of stderr') parser_log_stderr.add_argument('--head', '-H', dest='head', type=int, help='get head -100 content of stderr') parser_log_stderr.add_argument('--path', action='store_true', default=False, help='get the path of stderr file') parser_log_stderr.set_defaults(func=log_stderr) parser_log_trial = parser_log_subparsers.add_parser('trial', help='get trial log path') - parser_log_trial.add_argument('--id', '-i', dest='id', help='the id of experiment') + parser_log_trial.add_argument('id', nargs='?', help='the id of experiment') parser_log_trial.add_argument('--trialid', '-T', dest='trialid', help='find trial log path by id') parser_log_trial.set_defaults(func=log_trial) From 5c397f6e5655afebcc99c504cec795364d714c25 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 15:39:46 +0800 Subject: [PATCH 21/34] update nnictl --- tools/nnicmd/config_utils.py | 4 +- tools/nnicmd/launcher.py | 105 ++++++++++++++++++++++------------- tools/nnicmd/nnictl.py | 4 +- tools/nnicmd/nnictl_utils.py | 95 ++++++++++++++++--------------- tools/nnicmd/webui_utils.py | 4 +- 5 files changed, 120 insertions(+), 92 deletions(-) diff --git a/tools/nnicmd/config_utils.py b/tools/nnicmd/config_utils.py index b84e29ebbf..17adb05fd6 100644 --- a/tools/nnicmd/config_utils.py +++ b/tools/nnicmd/config_utils.py @@ -26,8 +26,8 @@ class Config: '''a util class to load and save config''' - def __init__(self, port): - config_path = os.path.join(NNICTL_HOME_DIR, str(port)) + def __init__(self, file_path): + config_path = os.path.join(NNICTL_HOME_DIR, str(file_path)) os.makedirs(config_path, exist_ok=True) self.config_file = os.path.join(config_path, '.config') self.config = self.read_file() diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 645caaa60f..676a2946c6 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -37,12 +37,9 @@ import random import string -CONFIG_FILE_NAME = ''.join(random.sample(string.ascii_letters + string.digits, 8)) - -def start_rest_server(port, platform, mode, experiment_id=None): +def start_rest_server(port, platform, mode, config_file_name, experiment_id=None): '''Run nni manager process''' - global CONFIG_FILE_NAME - print_normal('Checking environment...') + nni_config = Config(config_file_name) if detect_port(port): print_error('Port %s is used by another process, please reset the port!' % port) exit(1) @@ -52,8 +49,9 @@ def start_rest_server(port, platform, mode, experiment_id=None): cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode] if mode == 'resume': cmds += ['--experiment_id', experiment_id] - stdout_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stdout') - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + print(cmds) + stdout_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stdout') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') stdout_file = open(stdout_full_path, 'a+') stderr_file = open(stderr_full_path, 'a+') time_now = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) @@ -64,7 +62,7 @@ def start_rest_server(port, platform, mode, experiment_id=None): process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) return process, str(time_now) -def set_trial_config(experiment_config, port): +def set_trial_config(experiment_config, port, config_file_name): '''set trial configuration''' request_data = dict() value_dict = dict() @@ -87,16 +85,16 @@ def set_trial_config(experiment_config, port): return True else: print('Error message is {}'.format(response.text)) - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) return False -def set_local_config(experiment_config, port): +def set_local_config(experiment_config, port, config_file_name): '''set local configuration''' - return set_trial_config(experiment_config, port) + return set_trial_config(experiment_config, port, config_file_name) -def set_remote_config(experiment_config, port): +def set_remote_config(experiment_config, port, config_file_name): '''Call setClusterMetadata to pass trial''' #set machine_list request_data = dict() @@ -106,15 +104,15 @@ def set_remote_config(experiment_config, port): if not response or not check_response(response): if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message #set trial_config - return set_trial_config(experiment_config, port), err_message + return set_trial_config(experiment_config, port, config_file_name), err_message -def set_pai_config(experiment_config, port): +def set_pai_config(experiment_config, port, config_file_name): '''set pai configuration''' pai_config_data = dict() pai_config_data['pai_config'] = experiment_config['paiConfig'] @@ -123,15 +121,15 @@ def set_pai_config(experiment_config, port): if not response or not response.status_code == 200: if response is not None: err_message = response.text - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) return False, err_message #set trial_config - return set_trial_config(experiment_config, port), err_message + return set_trial_config(experiment_config, port, config_file_name), err_message -def set_experiment(experiment_config, mode, port): +def set_experiment(experiment_config, mode, port, config_file_name): '''Call startExperiment (rest POST /experiment) with yaml file content''' request_data = dict() request_data['authorName'] = experiment_config['authorName'] @@ -189,18 +187,17 @@ def set_experiment(experiment_config, mode, port): if check_response(response): return response else: - stderr_full_path = os.path.join(NNICTL_HOME_DIR, CONFIG_FILE_NAME, 'stderr') + stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') with open(stderr_full_path, 'a+') as fout: fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))) print_error('Setting experiment error, error message is {}'.format(response.text)) return None -def launch_experiment(args, experiment_config, mode, experiment_id=None): +def launch_experiment(args, experiment_config, mode, config_file_name, experiment_id=None): '''follow steps to start rest server and start experiment''' - global CONFIG_FILE_NAME - nni_config = Config(CONFIG_FILE_NAME) + nni_config = Config(config_file_name) # start rest server - rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, experiment_id) + rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id) nni_config.set_config('restServerPid', rest_process.pid) # Deal with annotation if experiment_config.get('useAnnotation'): @@ -235,7 +232,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): # set remote config if experiment_config['trainingServicePlatform'] == 'remote': print_normal('Setting remote config...') - config_result, err_msg = set_remote_config(experiment_config, args.port) + config_result, err_msg = set_remote_config(experiment_config, args.port, config_file_name) if config_result: print_normal('Successfully set remote config!') else: @@ -250,7 +247,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): # set local config if experiment_config['trainingServicePlatform'] == 'local': print_normal('Setting local config...') - if set_local_config(experiment_config, args.port): + if set_local_config(experiment_config, args.port, config_file_name): print_normal('Successfully set local config!') else: print_error('Failed!') @@ -264,7 +261,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): #set pai config if experiment_config['trainingServicePlatform'] == 'pai': print_normal('Setting pai config...') - config_result, err_msg = set_pai_config(experiment_config, args.port) + config_result, err_msg = set_pai_config(experiment_config, args.port, config_file_name) if config_result: print_normal('Successfully set pai config!') else: @@ -279,7 +276,7 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): # start a new experiment print_normal('Starting experiment...') - response = set_experiment(experiment_config, mode, args.port) + response = set_experiment(experiment_config, mode, args.port, config_file_name) if response: if experiment_id is None: experiment_id = json.loads(response.text).get('experiment_id') @@ -292,33 +289,61 @@ def launch_experiment(args, experiment_config, mode, experiment_id=None): except Exception: raise Exception(ERROR_INFO % 'Restful server stopped!') exit(1) - web_ui_url_list = get_web_ui_urls(args.port) + web_ui_url_list = get_web_ui_urls(args.port, config_file_name) #save experiment information experiment_config = Experiments() - experiment_config.add_experiment(experiment_id, args.port, start_time, CONFIG_FILE_NAME) + experiment_config.add_experiment(experiment_id, args.port, start_time, config_file_name) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) +def cmp_time(time1, time2): + '''compare the time''' + try: + time1 = time.strptime(time1,'%Y-%m-%d %H:%M:%S') + time2 = time.strptime(time2,'%Y-%m-%d %H:%M:%S') + return int(time1) - int(time2) + except: + return 0 + def resume_experiment(args): '''resume an experiment''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() - if experiment_dict.get(args.id) is None: - print_error('Id not exist!') - exit(1) - if experiment_dict[args.id]['status'] == 'running': - print_error('Experiment %s is running!' % args.id) - exit(1) - nni_config = Config(experiment_dict[args.id]['fileName']) + experiment_id = None + experiment_endTime = None + #find the latest stopped experiment + if not args.id: + for key in experiment_dict.keys(): + if experiment_dict[key]['status'] == 'stopped': + if experiment_id is None: + experiment_id = key + experiment_endTime = experiment_dict[key]['endTime'] + else: + if cmp_time(experiment_dict[key]['endTime'], experiment_endTime) > 0: + experiment_id = key + experiment_endTime = experiment_dict[key]['endTime'] + if experiment_id is None: + print_error('There is no experiment stopped!') + exit(1) + else: + if experiment_dict.get(args.id) is None: + print_error('Id not exist!') + exit(1) + if experiment_dict[args.id]['status'] == 'running': + print_error('Experiment %s is running!' % args.id) + exit(1) + experiment_id = args.id + print_normal('Resuming experiment %s...' % experiment_id) + nni_config = Config(experiment_dict[experiment_id]['fileName']) experiment_config = nni_config.get_config('experimentConfig') experiment_id = nni_config.get_config('experimentId') - launch_experiment(args, experiment_config, 'resume', experiment_id) + launch_experiment(args, experiment_config, 'resume', experiment_dict[experiment_id]['fileName'], experiment_id) def create_experiment(args): '''start a new experiment''' - global CONFIG_FILE_NAME - nni_config = Config(CONFIG_FILE_NAME) + config_file_name = ''.join(random.sample(string.ascii_letters + string.digits, 8)) + nni_config = Config(config_file_name) config_path = os.path.abspath(args.config) if not os.path.exists(config_path): print_error('Please set correct config path!') @@ -327,5 +352,5 @@ def create_experiment(args): validate_all_content(experiment_config, config_path) nni_config.set_config('experimentConfig', experiment_config) - launch_experiment(args, experiment_config, 'new') + launch_experiment(args, experiment_config, 'new', config_file_name) nni_config.set_config('restServerPort', args.port) diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index 4c17ebcaf0..da56a6a763 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -94,7 +94,7 @@ def parse_args(): parser_experiment_status.add_argument('id', nargs='?', help='the id of experiment') parser_experiment_status.set_defaults(func=experiment_status) parser_experiment_list = parser_experiment_subparsers.add_parser('list', help='list all of running experiment ids') - parser_experiment_list.add_argument('--all', action='store_true', default=False, help='list all of experiments') + parser_experiment_list.add_argument('all', nargs='?', help='list all of experiments') parser_experiment_list.set_defaults(func=experiment_list) #TODO:finish webui function @@ -140,7 +140,7 @@ def parse_args(): parser_package_subparsers = parser_package.add_subparsers() parser_package_install = parser_package_subparsers.add_parser('install', help='install packages') parser_package_install.add_argument('--name', '-n', dest='name', help='package name to be installed') - parser_package_install.set_defaults(func=package_install) + parser_package_install.set_defaults(func=package_install) parser_package_show = parser_package_subparsers.add_parser('show', help='show the information of packages') parser_package_show.set_defaults(func=package_show) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 21ab6128b2..d4d99309fd 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -33,13 +33,6 @@ def check_experiment_id(args): '''check if the id is valid - 1.If there is an id specified, return the corresponding port - 2.If there is no id specified, and there is an experiment running, return it as default port, or return Error - 3.If the id matches an experiment, nnictl will return the id. - 4.If the id ends with *, nnictl will match all ids matchs the regular - 5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id - 6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information - 7.Users could use 'nnictl stop all' to stop all experiments ''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() @@ -59,6 +52,9 @@ def check_experiment_id(args): experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) exit(1) + elif not running_experiment_list: + print_error('There is no experiment running!') + exit(1) else: return running_experiment_list[0] if experiment_dict.get(args.id): @@ -67,44 +63,15 @@ def check_experiment_id(args): print_error('Id not correct!') exit(1) -def get_config_filename(args): - '''get the file name of config file''' - experiment_id = check_experiment_id(args) - experiment_config = Experiments() - experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[experiment_id]['fileName'] - -def get_experiment_port(args): - '''get the port of experiment''' - experiment_id = check_experiment_id(args) - experiment_config = Experiments() - experiment_dict = experiment_config.get_all_experiments() - return experiment_dict[experiment_id]['port'] - -def convert_time_stamp_to_date(content): - '''Convert time stamp to date time format''' - start_time_stamp = content.get('startTime') - end_time_stamp = content.get('endTime') - if start_time_stamp: - start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") - content['startTime'] = str(start_time) - if end_time_stamp: - end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") - content['endTime'] = str(end_time) - return content - -def check_rest(args): - '''check if restful server is running''' - nni_config = Config(get_config_filename(args)) - rest_port = nni_config.get_config('restServerPort') - running, _ = check_rest_server_quick(rest_port) - if not running: - print_normal('Restful server is running...') - else: - print_normal('Restful server is not running...') - def parse_ids(args): - '''Parse the arguments for nnictl stop''' + '''Parse the arguments for nnictl stop + 1.If there is an id specified, return the corresponding id + 2.If there is no id specified, and there is an experiment running, return the id, or return Error + 3.If the id matches an experiment, nnictl will return the id. + 4.If the id ends with *, nnictl will match all ids matchs the regular + 5.If the id does not exist but match the prefix of an experiment id, nnictl will return the matched id + 6.If the id does not exist but match multiple prefix of the experiment ids, nnictl will give id information + ''' experiment_config = Experiments() experiment_dict = experiment_config.get_all_experiments() if not experiment_dict: @@ -145,6 +112,42 @@ def parse_ids(args): print_error('There are no experiments matched, please check experiment id...') return result_list +def get_config_filename(args): + '''get the file name of config file''' + experiment_id = check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[experiment_id]['fileName'] + +def get_experiment_port(args): + '''get the port of experiment''' + experiment_id = check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + return experiment_dict[experiment_id]['port'] + +def convert_time_stamp_to_date(content): + '''Convert time stamp to date time format''' + start_time_stamp = content.get('startTime') + end_time_stamp = content.get('endTime') + if start_time_stamp: + start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") + content['startTime'] = str(start_time) + if end_time_stamp: + end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") + content['endTime'] = str(end_time) + return content + +def check_rest(args): + '''check if restful server is running''' + nni_config = Config(get_config_filename(args)) + rest_port = nni_config.get_config('restServerPort') + running, _ = check_rest_server_quick(rest_port) + if not running: + print_normal('Restful server is running...') + else: + print_normal('Restful server is not running...') + def stop_experiment(args): '''Stop the experiment which is running''' experiment_id_list = parse_ids(args) @@ -326,7 +329,7 @@ def experiment_list(args): print('There is no experiment running...') exit(1) experiment_id_list = [] - if args.all: + if args.all and args.all == 'all': for key in experiment_dict.keys(): experiment_id_list.append(key) else: @@ -334,7 +337,7 @@ def experiment_list(args): if experiment_dict[key]['status'] == 'running': experiment_id_list.append(key) if not experiment_id_list: - print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments!') + print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!') experiment_information = "" for key in experiment_id_list: experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py index 89a5c2cf9d..69c374aebd 100644 --- a/tools/nnicmd/webui_utils.py +++ b/tools/nnicmd/webui_utils.py @@ -22,12 +22,12 @@ from socket import AddressFamily from .config_utils import Config -def get_web_ui_urls(port): +def get_web_ui_urls(port, CONFIG_FILE_NAME): webui_url_list = [] for name, info in psutil.net_if_addrs().items(): for addr in info: if AddressFamily.AF_INET == addr.family: webui_url_list.append('http://{}:{}'.format(addr.address, port)) - nni_config = Config(port) + nni_config = Config(CONFIG_FILE_NAME) nni_config.set_config('webuiUrl', webui_url_list) return webui_url_list From 2c68171069da5eb5f139b170238db13594032ef1 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 16:32:01 +0800 Subject: [PATCH 22/34] fix comment --- tools/nnicmd/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 676a2946c6..d0da1031ea 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -328,7 +328,7 @@ def resume_experiment(args): exit(1) else: if experiment_dict.get(args.id) is None: - print_error('Id not exist!') + print_error('Id %s not exist!' % args.id) exit(1) if experiment_dict[args.id]['status'] == 'running': print_error('Experiment %s is running!' % args.id) From 8d14ca9775825f1f8160f9924f9716942c471247 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:25:09 +0800 Subject: [PATCH 23/34] revert dockerfile --- deployment/Dockerfile | 60 +------------------------- deployment/Dockerfile.build.base | 72 ++++++++++++++++++++++++++++++++ docs/NNICTLDOC.md | 31 +++++++------- 3 files changed, 89 insertions(+), 74 deletions(-) create mode 100644 deployment/Dockerfile.build.base diff --git a/deployment/Dockerfile b/deployment/Dockerfile index 8ad2632402..c8686f9365 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -18,68 +18,10 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 +FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 LABEL maintainer='Microsoft NNI Team' -ENV HADOOP_VERSION=2.7.2 -LABEL HADOOP_VERSION=2.7.2 - -RUN DEBIAN_FRONTEND=noninteractive && \ - apt-get -y update && \ - apt-get -y install sudo \ - apt-utils \ - git \ - curl \ - vim \ - unzip \ - wget \ - build-essential \ - cmake \ - libopenblas-dev \ - automake \ - openjdk-8-jdk \ - openssh-client \ - openssh-server \ - lsof \ - python3.5 \ - python3-dev \ - python3-pip \ - python3-tk \ - libcupti-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# numpy 1.14.3 scipy 1.1.0 -RUN pip3 --no-cache-dir install \ - numpy==1.14.3 scipy==1.1.0 - -# -#Install hadoop -# -RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ - tar xz -C /usr/local && \ - mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop - -# -#Install NNI -# -RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 - -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ - HADOOP_INSTALL=/usr/local/hadoop \ - NVIDIA_VISIBLE_DEVICES=all - -ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ - HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ - HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ - HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ - HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ - HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" - -ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ - LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server - # #Tensorflow 1.10.0 # diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base new file mode 100644 index 0000000000..9dffcfc428 --- /dev/null +++ b/deployment/Dockerfile.build.base @@ -0,0 +1,72 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + LABEL maintainer='Microsoft NNI Team' + ENV HADOOP_VERSION=2.7.2 +LABEL HADOOP_VERSION=2.7.2 + RUN DEBIAN_FRONTEND=noninteractive && \ + apt-get -y update && \ + apt-get -y install sudo \ + apt-utils \ + git \ + curl \ + vim \ + unzip \ + wget \ + build-essential \ + cmake \ + libopenblas-dev \ + automake \ + openjdk-8-jdk \ + openssh-client \ + openssh-server \ + lsof \ + python3.5 \ + python3-dev \ + python3-pip \ + python3-tk \ + libcupti-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # numpy 1.14.3 scipy 1.1.0 +RUN pip3 --no-cache-dir install \ + numpy==1.14.3 scipy==1.1.0 + # +#Install hadoop +# +RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ + tar xz -C /usr/local && \ + mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop + # +#Install NNI +# +RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 + ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_INSTALL=/usr/local/hadoop \ + NVIDIA_VISIBLE_DEVICES=all + ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ + HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ + HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ + HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ + HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ + HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" + ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ + LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server + WORKDIR /root \ No newline at end of file diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index c10086d6eb..705bbc1ef5 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -49,7 +49,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| True| |The id of the experiment you want to resume| + | id| False| |The id of the experiment you want to resume| | --port, -p| False| |Rest port of the experiment you want to resume| @@ -88,8 +88,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --filename, -f| True| |the file storing your new search space| - | --id, -i| False| |ID of the experiment you want to set| * __nnictl update concurrency__ * Description @@ -104,8 +104,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --value, -v| True| |the number of allowed concurrent trials| - | --id, -i| False| |ID of the experiment you want to set| * __nnictl update duration__ * Description @@ -120,8 +120,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| + | --value, -v| True| |the experiment duration will be NUMBER seconds. SUFFIX may be 's' for seconds (the default), 'm' for minutes, 'h' for hours or 'd' for days.| * __nnictl trial__ @@ -138,7 +138,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| * __nnictl trial kill__ * Description @@ -152,9 +152,8 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --trialid, -t| True| |ID of the trial you want to kill.| - | --id, -i| False| |ID of the experiment you want to set| - @@ -172,7 +171,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| * __nnictl experiment status__ @@ -187,7 +186,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| + | id| False| |ID of the experiment you want to set| * __nnictl experiment list__ @@ -202,7 +201,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --all| False| False|Show all of experiments, including stopped experiments.| + | all| False| False|Show all of experiments, including stopped experiments.| @@ -230,10 +229,11 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --head, -h| False| |show head lines of stdout| | --tail, -t| False| |show tail lines of stdout| | --path, -p| False| |show the path of stdout file| - | --id, -i| False| |ID of the experiment you want to set| + * __nnictl log stderr__ * Description @@ -248,10 +248,11 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| | --head, -h| False| |show head lines of stderr| | --tail, -t| False| |show tail lines of stderr| | --path, -p| False| |show the path of stderr file| - | --id, -i| False| |ID of the experiment you want to set| + * __nnictl log trial__ * Description @@ -266,7 +267,7 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -I| False| |the id of trial| + | id| False| |the id of trial| ### Manage webui @@ -283,4 +284,4 @@ nnictl webui | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | - | --id, -i| False| |ID of the experiment you want to set| \ No newline at end of file + | id| False| |ID of the experiment you want to set| \ No newline at end of file From aeb7c665357a87391cc1fe3687a02c96fd11d076 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:26:53 +0800 Subject: [PATCH 24/34] update --- deployment/Dockerfile | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/deployment/Dockerfile b/deployment/Dockerfile index c8686f9365..d0ddf99587 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -1,23 +1,3 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 LABEL maintainer='Microsoft NNI Team' @@ -32,4 +12,4 @@ RUN pip3 --no-cache-dir install tensorflow-gpu==1.10.0 # RUN pip3 --no-cache-dir install Keras==2.1.6 -WORKDIR /root +WORKDIR /root \ No newline at end of file From a25654984f0d3bb76346af2ef4bf9dc36f3f00f0 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:29:08 +0800 Subject: [PATCH 25/34] update --- deployment/Dockerfile.build.base | 155 +++++++++++++++++-------------- deployment/README.md | 6 +- 2 files changed, 87 insertions(+), 74 deletions(-) diff --git a/deployment/Dockerfile.build.base b/deployment/Dockerfile.build.base index 9dffcfc428..56315a3b5f 100644 --- a/deployment/Dockerfile.build.base +++ b/deployment/Dockerfile.build.base @@ -1,72 +1,83 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 - LABEL maintainer='Microsoft NNI Team' - ENV HADOOP_VERSION=2.7.2 -LABEL HADOOP_VERSION=2.7.2 - RUN DEBIAN_FRONTEND=noninteractive && \ - apt-get -y update && \ - apt-get -y install sudo \ - apt-utils \ - git \ - curl \ - vim \ - unzip \ - wget \ - build-essential \ - cmake \ - libopenblas-dev \ - automake \ - openjdk-8-jdk \ - openssh-client \ - openssh-server \ - lsof \ - python3.5 \ - python3-dev \ - python3-pip \ - python3-tk \ - libcupti-dev && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - # numpy 1.14.3 scipy 1.1.0 -RUN pip3 --no-cache-dir install \ - numpy==1.14.3 scipy==1.1.0 - # -#Install hadoop -# -RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ - tar xz -C /usr/local && \ - mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop - # -#Install NNI -# -RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 - ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ - HADOOP_INSTALL=/usr/local/hadoop \ - NVIDIA_VISIBLE_DEVICES=all - ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ - HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ - HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ - HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ - HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ - HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" - ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ - LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server - WORKDIR /root \ No newline at end of file +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +LABEL maintainer='Microsoft NNI Team' + +ENV HADOOP_VERSION=2.7.2 +LABEL HADOOP_VERSION=2.7.2 + +RUN DEBIAN_FRONTEND=noninteractive && \ + apt-get -y update && \ + apt-get -y install sudo \ + apt-utils \ + git \ + curl \ + vim \ + unzip \ + wget \ + build-essential \ + cmake \ + libopenblas-dev \ + automake \ + openjdk-8-jdk \ + openssh-client \ + openssh-server \ + lsof \ + python3.5 \ + python3-dev \ + python3-pip \ + python3-tk \ + libcupti-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# numpy 1.14.3 scipy 1.1.0 +RUN pip3 --no-cache-dir install \ + numpy==1.14.3 scipy==1.1.0 + +# +#Install hadoop +# +RUN wget -qO- http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \ + tar xz -C /usr/local && \ + mv /usr/local/hadoop-${HADOOP_VERSION} /usr/local/hadoop + +# +#Install NNI +# +RUN pip3 install -v --user git+https://github.com/Microsoft/nni.git@v0.2 + +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \ + HADOOP_INSTALL=/usr/local/hadoop \ + NVIDIA_VISIBLE_DEVICES=all + +ENV HADOOP_PREFIX=${HADOOP_INSTALL} \ + HADOOP_BIN_DIR=${HADOOP_INSTALL}/bin \ + HADOOP_SBIN_DIR=${HADOOP_INSTALL}/sbin \ + HADOOP_HDFS_HOME=${HADOOP_INSTALL} \ + HADOOP_COMMON_LIB_NATIVE_DIR=${HADOOP_INSTALL}/lib/native \ + HADOOP_OPTS="-Djava.library.path=${HADOOP_INSTALL}/lib/native" + +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/sbin:/bin:${HADOOP_BIN_DIR}:${HADOOP_SBIN_DIR} \ + LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/targets/x86_64-linux/lib/stubs:${JAVA_HOME}/jre/lib/amd64/server + +WORKDIR /root diff --git a/deployment/README.md b/deployment/README.md index c9bd2e8175..19b84cba3f 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -2,7 +2,8 @@ Dockerfile === ## 1.Description This is the Dockerfile of nni project, including the most kinds of deeplearning frameworks and nni source code. You can run your nni experiment in this docker container directly. -Dockerfile could build docker image, users could build their customized docker image using this file. +Dockerfile.build.base could build the base Docker image, users can get a docker image with Ubuntu and NNI environment after building this file. +Dockerfile could build the customized docker image, users could build their customized docker image using this file. ## 2.Including Libraries ``` @@ -16,5 +17,6 @@ NNI v0.1 ## 3 How to run + docker build -f Dockerfile.build.base -t nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 . docker build -t nni/nni . - nvidia-docker run -it nni/nni \ No newline at end of file + nvidia-docker run -it nni/nni \ No newline at end of file From 3fb0bca2e6c3429ec1313619100cf0fbeda3738d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 17:33:27 +0800 Subject: [PATCH 26/34] update --- tools/nnicmd/launcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index d0da1031ea..519a82383e 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -49,7 +49,6 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode] if mode == 'resume': cmds += ['--experiment_id', experiment_id] - print(cmds) stdout_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stdout') stderr_full_path = os.path.join(NNICTL_HOME_DIR, config_file_name, 'stderr') stdout_file = open(stdout_full_path, 'a+') From d8dde9c4895c5266c2d94660094c40372ad780d7 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 23 Oct 2018 20:14:39 +0800 Subject: [PATCH 27/34] update --- nni | 1 + src/nni_manager/rest_server/restHandler.ts | 1 + src/nni_manager/rest_server/tensorboard.ts | 2 +- tools/nnicmd/nnictl.py | 7 +++++++ tools/nnicmd/nnictl_utils.py | 4 ++++ 5 files changed, 14 insertions(+), 1 deletion(-) create mode 160000 nni diff --git a/nni b/nni new file mode 160000 index 0000000000..ee6b149d89 --- /dev/null +++ b/nni @@ -0,0 +1 @@ +Subproject commit ee6b149d89d447e6d08c005bbc2de2a80b2b66a1 diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index cd90e149ff..29af12fb4c 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -77,6 +77,7 @@ class NNIRestHandler { this.getTriedParameters(router); this.startTensorBoard(router); this.stopTensorBoard(router); + this.getTrialLogPath(router); // Express-joi-validator configuration router.use((err: any, req: Request, res: Response, next: any) => { diff --git a/src/nni_manager/rest_server/tensorboard.ts b/src/nni_manager/rest_server/tensorboard.ts index 89873aa439..7fb2076371 100644 --- a/src/nni_manager/rest_server/tensorboard.ts +++ b/src/nni_manager/rest_server/tensorboard.ts @@ -130,7 +130,7 @@ export class TensorBoard { return logPath.split('://')[1].split(':')[0]; //TODO use url parse } - private async getLogDir(trialJobId: string): Promise { + public async getLogDir(trialJobId: string): Promise { const jobInfo: TrialJobInfo = await this.dataStore.getTrialJob(trialJobId); const logPath: string | undefined = jobInfo.logPath; if (logPath === undefined) { diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index d7fd49a046..f3f28ada40 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -148,6 +148,13 @@ def parse_args(): parser_package_show = parser_package_subparsers.add_parser('show', help='show the information of packages') parser_package_show.set_defaults(func=package_show) + #parse tensorboard command + parser_tensorboard = subparsers.add_parser('tensorboard', help='manage tensorboard') + parser_tensorboard_subparsers = parser_tensorboard.add_subparsers() + parser_tensorboard_start = parser_tensorboard_subparsers.add_parser('start', help='start tensorboard') + parser_tensorboard_start.add_argument('id', nargs='?', help='the id of experiment') + parser_tensorboard_start.set_defaults(func=start_tensorboard) + args = parser.parse_args() args.func(args) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index d4d99309fd..d830dd1d43 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -343,3 +343,7 @@ def experiment_list(args): experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \ experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) + +def start_tensorboard(args): + '''start tensorboard''' + pass From 7b3906492179bc6675f30add6951bda59688502d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 24 Oct 2018 21:17:43 +0800 Subject: [PATCH 28/34] add nnictl tensorboard --- src/nni_manager/rest_server/restHandler.ts | 1 - tools/nnicmd/launcher.py | 6 +- tools/nnicmd/nnictl.py | 6 ++ tools/nnicmd/nnictl_utils.py | 10 +- tools/nnicmd/tensorboard_utils.py | 104 +++++++++++++++++++++ tools/nnicmd/url_utils.py | 13 +++ tools/nnicmd/webui_utils.py | 33 ------- 7 files changed, 131 insertions(+), 42 deletions(-) create mode 100644 tools/nnicmd/tensorboard_utils.py delete mode 100644 tools/nnicmd/webui_utils.py diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index 29af12fb4c..cd90e149ff 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -77,7 +77,6 @@ class NNIRestHandler { this.getTriedParameters(router); this.startTensorBoard(router); this.stopTensorBoard(router); - this.getTrialLogPath(router); // Express-joi-validator configuration router.use((err: any, req: Request, res: Response, next: any) => { diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 519a82383e..5bbcd0e217 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -28,11 +28,10 @@ from nni_annotation import * from .launcher_utils import validate_all_content from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response -from .url_utils import cluster_metadata_url, experiment_url +from .url_utils import cluster_metadata_url, experiment_url, get_local_urls from .config_utils import Config, Experiments from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port from .constants import * -from .webui_utils import * import time import random import string @@ -288,7 +287,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen except Exception: raise Exception(ERROR_INFO % 'Restful server stopped!') exit(1) - web_ui_url_list = get_web_ui_urls(args.port, config_file_name) + web_ui_url_list = get_local_urls(args.port) + nni_config.set_config('webuiUrl', web_ui_url_list) #save experiment information experiment_config = Experiments() diff --git a/tools/nnicmd/nnictl.py b/tools/nnicmd/nnictl.py index f3f28ada40..827212e31a 100644 --- a/tools/nnicmd/nnictl.py +++ b/tools/nnicmd/nnictl.py @@ -25,6 +25,7 @@ from .nnictl_utils import * from .package_management import * from .constants import * +from .tensorboard_utils import * def nni_help_info(*args): print('please run "nnictl {positional argument} --help" to see nnictl guidance') @@ -153,7 +154,12 @@ def parse_args(): parser_tensorboard_subparsers = parser_tensorboard.add_subparsers() parser_tensorboard_start = parser_tensorboard_subparsers.add_parser('start', help='start tensorboard') parser_tensorboard_start.add_argument('id', nargs='?', help='the id of experiment') + parser_tensorboard_start.add_argument('--trialid', dest='trialid', help='the id of trial') + parser_tensorboard_start.add_argument('--port', dest='port', default=6006, help='the port to start tensorboard') parser_tensorboard_start.set_defaults(func=start_tensorboard) + parser_tensorboard_start = parser_tensorboard_subparsers.add_parser('stop', help='stop tensorboard') + parser_tensorboard_start.add_argument('id', nargs='?', help='the id of experiment') + parser_tensorboard_start.set_defaults(func=stop_tensorboard) args = parser.parse_args() args.func(args) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index d830dd1d43..d09945b471 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -173,9 +173,12 @@ def stop_experiment(args): #sleep to wait rest handler done time.sleep(3) rest_pid = nni_config.get_config('restServerPid') + tensorboard_pid = nni_config.get_config('tensorboardPid') if rest_pid: - cmds = ['pkill', '-P', str(rest_pid)] - call(cmds) + stop_rest_cmds = ['pkill', '-P', str(rest_pid)] + call(stop_rest_cmds) + stop_tensorboard_cmds = ['kill', '-9', str(tensorboard_pid)] + call(stop_tensorboard_cmds) if stop_rest_result: print_normal('Stop experiment success!') experiment_config.update_experiment(experiment_id, 'status', 'stopped') @@ -344,6 +347,3 @@ def experiment_list(args): experiment_dict[key]['startTime'], experiment_dict[key]['endTime'])) print(EXPERIMENT_INFORMATION_FORMAT % experiment_information) -def start_tensorboard(args): - '''start tensorboard''' - pass diff --git a/tools/nnicmd/tensorboard_utils.py b/tools/nnicmd/tensorboard_utils.py new file mode 100644 index 0000000000..1f10d96c00 --- /dev/null +++ b/tools/nnicmd/tensorboard_utils.py @@ -0,0 +1,104 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import os +import psutil +import json +import datetime +import time +from subprocess import call, check_output, Popen, PIPE +from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response +from .config_utils import Config, Experiments +from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, get_local_urls +from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT +import time +from .common_utils import print_normal, print_error, print_warning, detect_process, detect_port +from .nnictl_utils import * +import re + +def start_local_tensorboard_process(log_path, port, nni_config): + '''call cmds to start tensorboard process in local mode''' + if detect_port(port): + print_error('Port %s is used by another process, please reset port!' % str(port)) + exit(1) + temp_dir = os.environ['HOME'] + stdout_file = open(os.path.join(temp_dir, 'tensorboard_stdout'), 'a+') + stderr_file = open(os.path.join(temp_dir, 'tensorboard_stderr'), 'a+') + cmds = ['tensorboard', '--logdir', log_path, '--port', str(port)] + tensorboard_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) + url_list = get_local_urls(port) + print_normal('Start tensorboard success, you can visit tensorboard from: ' + ' '.join(url_list)) + nni_config.set_config('tensorboardPid', tensorboard_process.pid) + +def stop_tensorboard(args): + '''stop tensorboard''' + experiment_id = check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + config_file_name = experiment_dict[experiment_id]['fileName'] + nni_config = Config(config_file_name) + tensorboard_pid = nni_config.get_config('tensorboardPid') + cmds = ['kill', '-9', str(tensorboard_pid)] + call(cmds) + print_normal('Stop tensorboard success!') + + +def start_tensorboard(args): + '''start tensorboard''' + experiment_id = check_experiment_id(args) + experiment_config = Experiments() + experiment_dict = experiment_config.get_all_experiments() + config_file_name = experiment_dict[experiment_id]['fileName'] + nni_config = Config(config_file_name) + rest_port = nni_config.get_config('restServerPort') + rest_pid = nni_config.get_config('restServerPid') + if not detect_process(rest_pid): + print_error('Experiment is not running...') + return + running, response = check_rest_server_quick(rest_port) + trial_content = None + if running: + response = rest_get(trial_jobs_url(rest_port), 20) + if response and check_response(response): + trial_content = json.loads(response.text) + else: + print_error('List trial failed...') + else: + print_error('Restful server is not running...') + if not trial_content: + print_error('No trial information!') + exit(1) + path_list = [] + if args.trialid is None: + for trial in trial_content: + pattern = r'(?P.+)://(?P.+):(?P.*)' + match = re.search(pattern,trial['logPath']) + if match: + path_list.append(match.group('path')) + else: + for trial in trial_content: + if trial.get(args.trialid): + path_list.append(trial['logPath']) + break + if not path_list: + print_error('Trial id %s error!' % args.trialid) + exit(1) + + start_local_tensorboard_process(':'.join(path_list), args.port, nni_config) \ No newline at end of file diff --git a/tools/nnicmd/url_utils.py b/tools/nnicmd/url_utils.py index f47463cb06..2735baf686 100644 --- a/tools/nnicmd/url_utils.py +++ b/tools/nnicmd/url_utils.py @@ -18,6 +18,8 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +import psutil +from socket import AddressFamily BASE_URL = 'http://localhost' @@ -53,6 +55,7 @@ def trial_jobs_url(port): '''get trial_jobs url''' return '{0}:{1}{2}{3}'.format(BASE_URL, port, API_ROOT_URL, TRIAL_JOBS_API) + def trial_job_id_url(port, job_id): '''get trial_jobs with id url''' return '{0}:{1}{2}{3}/:{4}'.format(BASE_URL, port, API_ROOT_URL, TRIAL_JOBS_API, job_id) @@ -61,3 +64,13 @@ def trial_job_id_url(port, job_id): def tensorboard_url(port): '''get tensorboard url''' return '{0}:{1}{2}{3}'.format(BASE_URL, port, API_ROOT_URL, TENSORBOARD_API) + + +def get_local_urls(port): + '''get urls of local machine''' + url_list = [] + for name, info in psutil.net_if_addrs().items(): + for addr in info: + if AddressFamily.AF_INET == addr.family: + url_list.append('http://{}:{}'.format(addr.address, port)) + return url_list \ No newline at end of file diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py deleted file mode 100644 index 69c374aebd..0000000000 --- a/tools/nnicmd/webui_utils.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) Microsoft Corporation -# All rights reserved. -# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -import psutil -from socket import AddressFamily -from .config_utils import Config - -def get_web_ui_urls(port, CONFIG_FILE_NAME): - webui_url_list = [] - for name, info in psutil.net_if_addrs().items(): - for addr in info: - if AddressFamily.AF_INET == addr.family: - webui_url_list.append('http://{}:{}'.format(addr.address, port)) - nni_config = Config(CONFIG_FILE_NAME) - nni_config.set_config('webuiUrl', webui_url_list) - return webui_url_list From d6e5009321f1592b09042bf0aea1dfbf8fa6e38d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 24 Oct 2018 21:34:04 +0800 Subject: [PATCH 29/34] update --- nni | 1 - src/nni_manager/rest_server/tensorboard.ts | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 160000 nni diff --git a/nni b/nni deleted file mode 160000 index ee6b149d89..0000000000 --- a/nni +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ee6b149d89d447e6d08c005bbc2de2a80b2b66a1 diff --git a/src/nni_manager/rest_server/tensorboard.ts b/src/nni_manager/rest_server/tensorboard.ts index 7fb2076371..89873aa439 100644 --- a/src/nni_manager/rest_server/tensorboard.ts +++ b/src/nni_manager/rest_server/tensorboard.ts @@ -130,7 +130,7 @@ export class TensorBoard { return logPath.split('://')[1].split(':')[0]; //TODO use url parse } - public async getLogDir(trialJobId: string): Promise { + private async getLogDir(trialJobId: string): Promise { const jobInfo: TrialJobInfo = await this.dataStore.getTrialJob(trialJobId); const logPath: string | undefined = jobInfo.logPath; if (logPath === undefined) { From 3eeef2569ba99116a33bb3d003ba211d009ffdfa Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 25 Oct 2018 16:58:15 +0800 Subject: [PATCH 30/34] add remote mode --- setup.py | 3 +- tools/nnicmd/ssh_utils.py | 49 ++++++++++++++++++ tools/nnicmd/tensorboard_utils.py | 85 ++++++++++++++++++++++--------- tools/setup.py | 3 +- 4 files changed, 113 insertions(+), 27 deletions(-) create mode 100644 tools/nnicmd/ssh_utils.py diff --git a/setup.py b/setup.py index ea38f80667..1860a58869 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,8 @@ def run(self): 'requests', 'scipy', 'schema', - 'pyhdfs' + 'pyhdfs', + 'paramiko' ], cmdclass={ diff --git a/tools/nnicmd/ssh_utils.py b/tools/nnicmd/ssh_utils.py new file mode 100644 index 0000000000..befd25deb3 --- /dev/null +++ b/tools/nnicmd/ssh_utils.py @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import paramiko +import os +from .common_utils import print_error + +def copy_remote_directory_to_local(sftp, remote_path, local_path): + '''copy remote directory to local machine''' + try: + os.makedirs(local_path, exist_ok=True) + files = sftp.listdir(remote_path) + for file in files: + remote_full_path = os.path.join(remote_path, file) + local_full_path = os.path.join(local_path, file) + try: + if sftp.listdir(remote_full_path): + copy_remote_directory_to_local(sftp, remote_full_path, local_full_path) + except: + sftp.get(remote_full_path, local_full_path) + except Exception: + pass + +def create_ssh_sftp_client(host_ip, port, username, password): + '''create ssh client''' + try: + conn = paramiko.Transport(host_ip, port) + conn.connect(username=username, password=password) + sftp = paramiko.SFTPClient.from_transport(conn) + return sftp + except Exception as exception: + print_error('Create ssh client error %s\n' % exception) \ No newline at end of file diff --git a/tools/nnicmd/tensorboard_utils.py b/tools/nnicmd/tensorboard_utils.py index 1f10d96c00..37f34d4508 100644 --- a/tools/nnicmd/tensorboard_utils.py +++ b/tools/nnicmd/tensorboard_utils.py @@ -32,18 +32,63 @@ from .common_utils import print_normal, print_error, print_warning, detect_process, detect_port from .nnictl_utils import * import re +from .ssh_utils import create_ssh_sftp_client, copy_remote_directory_to_local +import tempfile -def start_local_tensorboard_process(log_path, port, nni_config): - '''call cmds to start tensorboard process in local mode''' - if detect_port(port): - print_error('Port %s is used by another process, please reset port!' % str(port)) +def parse_log_path(args, trial_content): + '''parse log path''' + path_list = [] + host_list = [] + for trial in trial_content: + if args.trialid and trial.get(args.trialid) is None: + continue + pattern = r'(?P.+)://(?P.+):(?P.*)' + match = re.search(pattern,trial['logPath']) + if match: + path_list.append(match.group('path')) + host_list.append(match.group('host')) + if not path_list: + print_error('Trial id %s error!' % args.trialid) + exit(1) + return path_list, host_list + +def copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, temp_nni_path): + '''use ssh client to copy data from remote machine to local machien''' + machine_list = nni_config.get_config('experimentConfig').get('machineList') + machine_dict = {} + local_path_list = [] + for machine in machine_list: + machine_dict[machine['ip']] = {'port': machine['port'], 'passwd': machine['passwd'], 'username': machine['username']} + for index, host in enumerate(host_list): + local_path = os.path.join(temp_nni_path, trial_content[index].get('id')) + local_path_list.append(local_path) + sftp = create_ssh_sftp_client(host, machine_dict[host]['port'], machine_dict[host]['username'], machine_dict[host]['passwd']) + copy_remote_directory_to_local(sftp, path_list[index], local_path) + return local_path_list + +def get_path_list(args, nni_config, trial_content, temp_nni_path): + '''get path list according to different platform''' + path_list, host_list = parse_log_path(args, trial_content) + platform = nni_config.get_config('experimentConfig').get('trainingServicePlatform') + if platform == 'local': + return path_list + elif platform == 'remote': + return copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, temp_nni_path) + else: + print_error('Not supported platform!') + exit(1) + +def start_tensorboard_process(args, nni_config, path_list, temp_nni_path): + '''call cmds to start tensorboard process in local machine''' + if detect_port(args.port): + print_error('Port %s is used by another process, please reset port!' % str(args.port)) exit(1) - temp_dir = os.environ['HOME'] - stdout_file = open(os.path.join(temp_dir, 'tensorboard_stdout'), 'a+') - stderr_file = open(os.path.join(temp_dir, 'tensorboard_stderr'), 'a+') - cmds = ['tensorboard', '--logdir', log_path, '--port', str(port)] + + stdout_file = open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') + stderr_file = open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') + cmds = ['tensorboard', '--logdir', ':'.join(path_list), '--port', str(args.port)] tensorboard_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) - url_list = get_local_urls(port) + url_list = get_local_urls(args.port) print_normal('Start tensorboard success, you can visit tensorboard from: ' + ' '.join(url_list)) nni_config.set_config('tensorboardPid', tensorboard_process.pid) @@ -85,20 +130,10 @@ def start_tensorboard(args): if not trial_content: print_error('No trial information!') exit(1) - path_list = [] - if args.trialid is None: - for trial in trial_content: - pattern = r'(?P.+)://(?P.+):(?P.*)' - match = re.search(pattern,trial['logPath']) - if match: - path_list.append(match.group('path')) - else: - for trial in trial_content: - if trial.get(args.trialid): - path_list.append(trial['logPath']) - break - if not path_list: - print_error('Trial id %s error!' % args.trialid) - exit(1) - start_local_tensorboard_process(':'.join(path_list), args.port, nni_config) \ No newline at end of file + experiment_id = nni_config.get_config('experimentId') + temp_nni_path = os.path.join(tempfile.gettempdir(), 'nni', experiment_id) + os.makedirs(temp_nni_path, exist_ok=True) + + path_list = get_path_list(args, nni_config, trial_content, temp_nni_path) + start_tensorboard_process(args, nni_config, path_list, temp_nni_path) diff --git a/tools/setup.py b/tools/setup.py index 7b368f4267..5605926b5f 100644 --- a/tools/setup.py +++ b/tools/setup.py @@ -12,7 +12,8 @@ 'psutil', 'astor', 'schema', - 'pyhdfs' + 'pyhdfs', + 'paramiko' ], author = 'Microsoft NNI Team', From 46aa1a502147b83a62c0b6190434497db624c18d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 25 Oct 2018 17:36:22 +0800 Subject: [PATCH 31/34] add tips --- tools/nnicmd/tensorboard_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/nnicmd/tensorboard_utils.py b/tools/nnicmd/tensorboard_utils.py index 37f34d4508..c786253ae4 100644 --- a/tools/nnicmd/tensorboard_utils.py +++ b/tools/nnicmd/tensorboard_utils.py @@ -60,6 +60,7 @@ def copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, for machine in machine_list: machine_dict[machine['ip']] = {'port': machine['port'], 'passwd': machine['passwd'], 'username': machine['username']} for index, host in enumerate(host_list): + print_normal('Copying log data from %s ...' % host) local_path = os.path.join(temp_nni_path, trial_content[index].get('id')) local_path_list.append(local_path) sftp = create_ssh_sftp_client(host, machine_dict[host]['port'], machine_dict[host]['username'], machine_dict[host]['passwd']) From 09938260975d6a359228c99ef1fd47a96594c132 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 26 Oct 2018 16:33:01 +0800 Subject: [PATCH 32/34] update tensorboard --- docs/NNICTLDOC.md | 42 ++++++++++++++++++++++++++++ tools/nnicmd/nnictl_utils.py | 9 ++++-- tools/nnicmd/tensorboard_utils.py | 46 +++++++++++++++++++++++-------- 3 files changed, 82 insertions(+), 15 deletions(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 705bbc1ef5..5eb536f411 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -282,6 +282,48 @@ nnictl webui Options: + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| + + +### Manage tensorboard +* __nnictl tensorboard start__ + * Description + + Start the tensorboard process. + + * Usage + + nnictl tensorboard start + + Options: + + | Name, shorthand | Required|Default | Description | + | ------ | ------ | ------ |------ | + | id| False| |ID of the experiment you want to set| + | --trialid| False| |ID of the trial| + | --port| False| 6006|The port of the tensorboard process| + + * Detail + + 1. NNICTL support tensorboard function in local and remote platform for the moment, other platforms will be supported later. + 2. If you want to use tensorboard, you need to write your tensorboard log data to environment variable [NNI_OUTPUT_DIR] path. + 3. In local mode, nnictl will set --logdir=[NNI_OUTPUT_DIR] directly and start a tensorboard process. + 4. In remote mode, nnictl will create a ssh client to copy log data from remote machine to local temp directory firstly, and then start a tensorboard process in your local machine. You need to notice that nnictl only copy the log data one time when you use the command, if you want to see the later result of tensorboard, you should execute nnictl tensorboard command again. + 5. If there are multiple trial jobs running, you should set the trialid, or you could use [nnictl tensorboard start --trialid all] to map --logdir to all trial log paths. + +* __nnictl tensorboard stop__ + * Description + + Stop all of the tensorboard process. + + * Usage + + nnictl tensorboard stop + + Options: + | Name, shorthand | Required|Default | Description | | ------ | ------ | ------ |------ | | id| False| |ID of the experiment you want to set| \ No newline at end of file diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index d09945b471..02fe312106 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -173,12 +173,15 @@ def stop_experiment(args): #sleep to wait rest handler done time.sleep(3) rest_pid = nni_config.get_config('restServerPid') - tensorboard_pid = nni_config.get_config('tensorboardPid') if rest_pid: stop_rest_cmds = ['pkill', '-P', str(rest_pid)] call(stop_rest_cmds) - stop_tensorboard_cmds = ['kill', '-9', str(tensorboard_pid)] - call(stop_tensorboard_cmds) + tensorboard_pid_list = nni_config.get_config('tensorboardPidList') + if tensorboard_pid_list: + for tensorboard_pid in tensorboard_pid_list: + cmds = ['kill', '-9', str(tensorboard_pid)] + call(cmds) + nni_config.set_config('tensorboardPidList', []) if stop_rest_result: print_normal('Stop experiment success!') experiment_config.update_experiment(experiment_id, 'status', 'stopped') diff --git a/tools/nnicmd/tensorboard_utils.py b/tools/nnicmd/tensorboard_utils.py index c786253ae4..f4f4c0dc82 100644 --- a/tools/nnicmd/tensorboard_utils.py +++ b/tools/nnicmd/tensorboard_utils.py @@ -27,7 +27,7 @@ from .rest_utils import rest_get, rest_delete, check_rest_server_quick, check_response from .config_utils import Config, Experiments from .url_utils import trial_jobs_url, experiment_url, trial_job_id_url, get_local_urls -from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT +from .constants import NNICTL_HOME_DIR, EXPERIMENT_INFORMATION_FORMAT, EXPERIMENT_DETAIL_FORMAT, COLOR_GREEN_FORMAT import time from .common_utils import print_normal, print_error, print_warning, detect_process, detect_port from .nnictl_utils import * @@ -40,7 +40,7 @@ def parse_log_path(args, trial_content): path_list = [] host_list = [] for trial in trial_content: - if args.trialid and trial.get(args.trialid) is None: + if args.trialid and args.trialid != 'all' and trial.get('id') != args.trialid: continue pattern = r'(?P.+)://(?P.+):(?P.*)' match = re.search(pattern,trial['logPath']) @@ -60,11 +60,12 @@ def copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, for machine in machine_list: machine_dict[machine['ip']] = {'port': machine['port'], 'passwd': machine['passwd'], 'username': machine['username']} for index, host in enumerate(host_list): - print_normal('Copying log data from %s ...' % host) local_path = os.path.join(temp_nni_path, trial_content[index].get('id')) local_path_list.append(local_path) + print_normal('Copying log data from %s to %s' % (host + ':' + path_list[index], local_path)) sftp = create_ssh_sftp_client(host, machine_dict[host]['port'], machine_dict[host]['username'], machine_dict[host]['passwd']) copy_remote_directory_to_local(sftp, path_list[index], local_path) + print_normal('Copy done!') return local_path_list def get_path_list(args, nni_config, trial_content, temp_nni_path): @@ -72,13 +73,22 @@ def get_path_list(args, nni_config, trial_content, temp_nni_path): path_list, host_list = parse_log_path(args, trial_content) platform = nni_config.get_config('experimentConfig').get('trainingServicePlatform') if platform == 'local': + print_normal('Log path: %s' % ' '.join(path_list)) return path_list elif platform == 'remote': - return copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, temp_nni_path) + path_list = copy_data_from_remote(args, nni_config, trial_content, path_list, host_list, temp_nni_path) + print_normal('Log path: %s' % ' '.join(path_list)) + return path_list else: print_error('Not supported platform!') exit(1) +def format_tensorboard_log_path(path_list): + new_path_list = [] + for index, value in enumerate(path_list): + new_path_list.append('name%d:%s' % (index + 1, value)) + return ','.join(new_path_list) + def start_tensorboard_process(args, nni_config, path_list, temp_nni_path): '''call cmds to start tensorboard process in local machine''' if detect_port(args.port): @@ -87,11 +97,16 @@ def start_tensorboard_process(args, nni_config, path_list, temp_nni_path): stdout_file = open(os.path.join(temp_nni_path, 'tensorboard_stdout'), 'a+') stderr_file = open(os.path.join(temp_nni_path, 'tensorboard_stderr'), 'a+') - cmds = ['tensorboard', '--logdir', ':'.join(path_list), '--port', str(args.port)] + cmds = ['tensorboard', '--logdir', format_tensorboard_log_path(path_list), '--port', str(args.port)] tensorboard_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) url_list = get_local_urls(args.port) - print_normal('Start tensorboard success, you can visit tensorboard from: ' + ' '.join(url_list)) - nni_config.set_config('tensorboardPid', tensorboard_process.pid) + print_normal(COLOR_GREEN_FORMAT % 'Start tensorboard success!\n' + 'Tensorboard urls: ' + ' '.join(url_list)) + tensorboard_process_pid_list = nni_config.get_config('tensorboardPidList') + if tensorboard_process_pid_list is None: + tensorboard_process_pid_list = [tensorboard_process.pid] + else: + tensorboard_process_pid_list.append(tensorboard_process.pid) + nni_config.set_config('tensorboardPidList', tensorboard_process_pid_list) def stop_tensorboard(args): '''stop tensorboard''' @@ -100,10 +115,15 @@ def stop_tensorboard(args): experiment_dict = experiment_config.get_all_experiments() config_file_name = experiment_dict[experiment_id]['fileName'] nni_config = Config(config_file_name) - tensorboard_pid = nni_config.get_config('tensorboardPid') - cmds = ['kill', '-9', str(tensorboard_pid)] - call(cmds) - print_normal('Stop tensorboard success!') + tensorboard_pid_list = nni_config.get_config('tensorboardPidList') + if tensorboard_pid_list: + for tensorboard_pid in tensorboard_pid_list: + cmds = ['kill', '-9', str(tensorboard_pid)] + call(cmds) + nni_config.set_config('tensorboardPidList', []) + print_normal('Stop tensorboard success!') + else: + print_error('No tensorboard configuration!') def start_tensorboard(args): @@ -131,7 +151,9 @@ def start_tensorboard(args): if not trial_content: print_error('No trial information!') exit(1) - + if len(trial_content) > 1 and not args.trialid: + print_error('There are multiple trials, please set trial id!') + exit(1) experiment_id = nni_config.get_config('experimentId') temp_nni_path = os.path.join(tempfile.gettempdir(), 'nni', experiment_id) os.makedirs(temp_nni_path, exist_ok=True) From 41cc1476ad5363fc299d2adcfc1f0ceaae587b7b Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 26 Oct 2018 16:34:19 +0800 Subject: [PATCH 33/34] update --- docs/NNICTLDOC.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/NNICTLDOC.md b/docs/NNICTLDOC.md index 5eb536f411..bbdbd3aac3 100644 --- a/docs/NNICTLDOC.md +++ b/docs/NNICTLDOC.md @@ -311,7 +311,7 @@ nnictl webui 2. If you want to use tensorboard, you need to write your tensorboard log data to environment variable [NNI_OUTPUT_DIR] path. 3. In local mode, nnictl will set --logdir=[NNI_OUTPUT_DIR] directly and start a tensorboard process. 4. In remote mode, nnictl will create a ssh client to copy log data from remote machine to local temp directory firstly, and then start a tensorboard process in your local machine. You need to notice that nnictl only copy the log data one time when you use the command, if you want to see the later result of tensorboard, you should execute nnictl tensorboard command again. - 5. If there are multiple trial jobs running, you should set the trialid, or you could use [nnictl tensorboard start --trialid all] to map --logdir to all trial log paths. + 5. If there is only one trial job, you don't need to set trialid. If there are multiple trial jobs running, you should set the trialid, or you could use [nnictl tensorboard start --trialid all] to map --logdir to all trial log paths. * __nnictl tensorboard stop__ * Description From aa1a705a6e304d9106a176eff519b2e9839aed98 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Fri, 26 Oct 2018 16:54:40 +0800 Subject: [PATCH 34/34] update --- tools/nnicmd/nnictl_utils.py | 7 +++++-- tools/nnicmd/tensorboard_utils.py | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/nnicmd/nnictl_utils.py b/tools/nnicmd/nnictl_utils.py index 02fe312106..40a3af8284 100644 --- a/tools/nnicmd/nnictl_utils.py +++ b/tools/nnicmd/nnictl_utils.py @@ -179,8 +179,11 @@ def stop_experiment(args): tensorboard_pid_list = nni_config.get_config('tensorboardPidList') if tensorboard_pid_list: for tensorboard_pid in tensorboard_pid_list: - cmds = ['kill', '-9', str(tensorboard_pid)] - call(cmds) + try: + cmds = ['kill', '-9', str(tensorboard_pid)] + call(cmds) + except Exception as exception: + print_error(exception) nni_config.set_config('tensorboardPidList', []) if stop_rest_result: print_normal('Stop experiment success!') diff --git a/tools/nnicmd/tensorboard_utils.py b/tools/nnicmd/tensorboard_utils.py index f4f4c0dc82..ba645b544c 100644 --- a/tools/nnicmd/tensorboard_utils.py +++ b/tools/nnicmd/tensorboard_utils.py @@ -118,8 +118,11 @@ def stop_tensorboard(args): tensorboard_pid_list = nni_config.get_config('tensorboardPidList') if tensorboard_pid_list: for tensorboard_pid in tensorboard_pid_list: - cmds = ['kill', '-9', str(tensorboard_pid)] - call(cmds) + try: + cmds = ['kill', '-9', str(tensorboard_pid)] + call(cmds) + except Exception as exception: + print_error(exception) nni_config.set_config('tensorboardPidList', []) print_normal('Stop tensorboard success!') else: