From 7be29e85561ce973f25236edc5073b6586e67942 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Fri, 8 May 2020 13:00:24 +0800 Subject: [PATCH 1/8] Add log content for PAIYarnTrainingService (#2409) --- .../training_service/pai/paiYarn/paiYarnTrainingService.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index 80699d0b0e..13bcdfd20f 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -283,6 +283,9 @@ class PAIYarnTrainingService extends PAITrainingService { }; request(submitJobRequest, (error: Error, response: request.Response, _body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 400) { + const errorMessage: string = (error !== undefined && error !== null) ? error.message : + `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body.message}`; + this.log.error(errorMessage); trialJobDetail.status = 'FAILED'; deferred.resolve(true); } else { From f40242a7de80b52dc12d4751d4e51c95e4e28040 Mon Sep 17 00:00:00 2001 From: Sean Takafuji Date: Fri, 8 May 2020 01:13:41 -0700 Subject: [PATCH 2/8] fix documentation for FrameworkController (#2407) --- docs/en_US/TrainingService/FrameworkControllerMode.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/en_US/TrainingService/FrameworkControllerMode.md b/docs/en_US/TrainingService/FrameworkControllerMode.md index b33d2f6d2e..dfa4260eea 100644 --- a/docs/en_US/TrainingService/FrameworkControllerMode.md +++ b/docs/en_US/TrainingService/FrameworkControllerMode.md @@ -26,7 +26,7 @@ NNI supports running experiment using [FrameworkController](https://github.com/M ## Setup FrameworkController -Follow the 
[guideline](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run) to set up FrameworkController in the Kubernetes cluster, NNI supports FrameworkController by the stateful set mode. +Follow the [guideline](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run) to set up FrameworkController in the Kubernetes cluster, NNI supports FrameworkController by the stateful set mode. If your cluster enforces authorization, you need to create a service account with granted permission for FrameworkController, and then pass the name of the FrameworkController service account to the NNI Experiment Config. [refer](https://github.com/Microsoft/frameworkcontroller/tree/master/example/run#run-by-kubernetes-statefulset) ## Design @@ -83,6 +83,7 @@ If you use Azure Kubernetes Service, you should set `frameworkcontrollerConfig` ```yaml frameworkcontrollerConfig: storage: azureStorage + serviceAccountName: {your_frameworkcontroller_service_account_name} keyVault: vaultName: {your_vault_name} name: {your_secert_name} From ef2069132cd7a9b0dd44aab2b93f18b470ac2eeb Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Fri, 8 May 2020 17:23:24 +0800 Subject: [PATCH 3/8] Fix SPOS state dict (#2375) --- examples/nas/spos/network.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/nas/spos/network.py b/examples/nas/spos/network.py index 9083321641..b365156c5a 100644 --- a/examples/nas/spos/network.py +++ b/examples/nas/spos/network.py @@ -147,8 +147,10 @@ def _initialize_weights(self): def load_and_parse_state_dict(filepath="./data/checkpoint-150000.pth.tar"): checkpoint = torch.load(filepath, map_location=torch.device("cpu")) + if "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] result = dict() - for k, v in checkpoint["state_dict"].items(): + for k, v in checkpoint.items(): if k.startswith("module."): k = k[len("module."):] result[k] = v From 31a247b7a9f44b1fbb71c5a35c10fb71b8bcca5f Mon Sep 17 00:00:00 2001 From: 
chicm-ms <38930155+chicm-ms@users.noreply.github.com> Date: Fri, 8 May 2020 18:21:22 +0800 Subject: [PATCH 4/8] Fix random tuner document (#2346) --- docs/en_US/Tuner/BuiltinTuner.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/en_US/Tuner/BuiltinTuner.md b/docs/en_US/Tuner/BuiltinTuner.md index 7b035d7ca7..2cff5cded3 100644 --- a/docs/en_US/Tuner/BuiltinTuner.md +++ b/docs/en_US/Tuner/BuiltinTuner.md @@ -68,10 +68,6 @@ tuner: Random search is suggested when each trial does not take very long (e.g., each trial can be completed very quickly, or early stopped by the assessor), and you have enough computational resources. It's also useful if you want to uniformly explore the search space. Random Search can be considered a baseline search algorithm. [Detailed Description](./HyperoptTuner.md) -**classArgs Requirements:** - -* **optimize_mode** (*maximize or minimize, optional, default = maximize*) - If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics. - **Example Configuration:** ```yaml From 7e35d32e2987493838779826155f7434bc30b81c Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Sun, 10 May 2020 17:52:21 +0800 Subject: [PATCH 5/8] Remove multiphase from WebUI (#2396) As requirement of #2390 , removed multiphase from WebUI, document, and put all trials flatten on WebUI. 1. The major change is to split trial-jobs to trials in webui, and use parameter id as part of trial id. 2. If multiphase is enabled, the limitation of trial controls job count only, not trials actually. 3. When multiphase enabled, the trial status may not be right. Previous trials in a job will be marked as success, only last trial presents the job's status. 4. multiphase related documents and UX are removed. minor changes, 1. Update dev document for webui development. 
--- docs/en_US/AdvancedFeature/MultiPhase.md | 87 --------------- docs/en_US/Release.md | 7 +- docs/en_US/Tuner/CustomizeTuner.md | 4 +- docs/en_US/Tutorial/ExperimentConfig.md | 13 --- .../Tutorial/SetupNniDeveloperEnvironment.md | 6 +- docs/en_US/hpo_advanced.rst | 1 - docs/zh_CN/AdvancedFeature/MultiPhase.md | 88 --------------- .../components/Modals/ExperimentDrawer.tsx | 17 +-- src/webui/src/components/Modals/Killjob.tsx | 4 +- .../src/components/public-child/OpenRow.tsx | 20 ---- .../src/components/trial-detail/TableList.tsx | 54 +++++---- src/webui/src/static/const.ts | 9 +- src/webui/src/static/interface.ts | 8 +- src/webui/src/static/model/experiment.ts | 4 - src/webui/src/static/model/trial.ts | 12 +- src/webui/src/static/model/trialmanager.ts | 104 +++++++++++++++--- 16 files changed, 148 insertions(+), 290 deletions(-) delete mode 100644 docs/en_US/AdvancedFeature/MultiPhase.md delete mode 100644 docs/zh_CN/AdvancedFeature/MultiPhase.md diff --git a/docs/en_US/AdvancedFeature/MultiPhase.md b/docs/en_US/AdvancedFeature/MultiPhase.md deleted file mode 100644 index a2f62b700b..0000000000 --- a/docs/en_US/AdvancedFeature/MultiPhase.md +++ /dev/null @@ -1,87 +0,0 @@ -# Multi-phase - -## What is multi-phase experiment - -Typically each trial job gets a single configuration (e.g., hyperparameters) from tuner, tries this configuration and reports result, then exits. But sometimes a trial job may wants to request multiple configurations from tuner. We find this is a very compelling feature. For example: - -1. Job launch takes tens of seconds in some training platform. If a configuration takes only around a minute to finish, running only one configuration in a trial job would be very inefficient. An appealing way is that a trial job requests a configuration and finishes it, then requests another configuration and run. The extreme case is that a trial job can run infinite configurations. 
If you set concurrency to be for example 6, there would be 6 __long running__ jobs keeping trying different configurations. - -2. Some types of models have to be trained phase by phase, the configuration of next phase depends on the results of previous phase(s). For example, to find the best quantization for a model, the training procedure is often as follows: the auto-quantization algorithm (i.e., tuner in NNI) chooses a size of bits (e.g., 16 bits), a trial job gets this configuration and trains the model for some epochs and reports result (e.g., accuracy). The algorithm receives this result and makes decision of changing 16 bits to 8 bits, or changing back to 32 bits. This process is repeated for a configured times. - -The above cases can be supported by the same feature, i.e., multi-phase execution. To support those cases, basically a trial job should be able to request multiple configurations from tuner. Tuner is aware of whether two configuration requests are from the same trial job or different ones. Also in multi-phase a trial job can report multiple final results. - -## Create multi-phase experiment - -### Write trial code which leverages multi-phase: - -__1. Update trial code__ - -It is pretty simple to use multi-phase in trial code, an example is shown below: - -```python -# ... -for i in range(5): - # get parameter from tuner - tuner_param = nni.get_next_parameter() - # nni.get_next_parameter returns None if there is no more hyper parameters can be generated by tuner. - if tuner_param is None: - break - - # consume the params - # ... - # report final result somewhere for the parameter retrieved above - nni.report_final_result() - # ... -# ... -``` - -In multi-phase experiments, at each time the API `nni.get_next_parameter()` is called, it returns a new hyper parameter generated by tuner, then the trail code consume this new hyper parameter and report final result of this hyper parameter. 
`nni.get_next_parameter()` and `nni.report_final_result()` should be called sequentially: __call the former one, then call the later one; and repeat this pattern__. If `nni.get_next_parameter()` is called multiple times consecutively, and then `nni.report_final_result()` is called once, the result is associated to the last configuration, which is retrieved from the last get_next_parameter call. So there is no result associated to previous get_next_parameter calls, and it may cause some multi-phase algorithm broken. - -Note that, `nni.get_next_parameter` returns None if there is no more hyper parameters can be generated by tuner. - -__2. Experiment configuration__ - -To enable multi-phase, you should also add `multiPhase: true` in your experiment YAML configure file. If this line is not added, `nni.get_next_parameter()` would always return the same configuration. - -Multi-phase experiment configuration example: - -```yaml -authorName: default -experimentName: multiphase experiment -trialConcurrency: 2 -maxExecDuration: 1h -maxTrialNum: 8 -trainingServicePlatform: local -searchSpacePath: search_space.json -multiPhase: true -useAnnotation: false -tuner: - builtinTunerName: TPE - classArgs: - optimize_mode: maximize -trial: - command: python3 mytrial.py - codeDir: . - gpuNum: 0 -``` - -### Write a tuner that leverages multi-phase: - -Before writing a multi-phase tuner, we highly suggest you to go through [Customize Tuner](https://nni.readthedocs.io/en/latest/Tuner/CustomizeTuner.html). Same as writing a normal tuner, your tuner needs to inherit from `Tuner` class. 
When you enable multi-phase through configuration (set `multiPhase` to true), your tuner will get an additional parameter `trial_job_id` via tuner's following methods: - -```text -generate_parameters -generate_multiple_parameters -receive_trial_result -receive_customized_trial_result -trial_end -``` - -With this information, the tuner could know which trial is requesting a configuration, and which trial is reporting results. This information provides enough flexibility for your tuner to deal with different trials and different phases. For example, you may want to use the trial_job_id parameter of generate_parameters method to generate hyperparameters for a specific trial job. - -### Tuners support multi-phase experiments: - -[TPE](../Tuner/HyperoptTuner.md), [Random](../Tuner/HyperoptTuner.md), [Anneal](../Tuner/HyperoptTuner.md), [Evolution](../Tuner/EvolutionTuner.md), [SMAC](../Tuner/SmacTuner.md), [NetworkMorphism](../Tuner/NetworkmorphismTuner.md), [MetisTuner](../Tuner/MetisTuner.md), [BOHB](../Tuner/BohbAdvisor.md), [Hyperband](../Tuner/HyperbandAdvisor.md). 
- -### Training services support multi-phase experiment: -[Local Machine](../TrainingService/LocalMode.md), [Remote Servers](../TrainingService/RemoteMachineMode.md), [OpenPAI](../TrainingService/PaiMode.md) diff --git a/docs/en_US/Release.md b/docs/en_US/Release.md index 8d44d8616b..ab7ed18ff1 100644 --- a/docs/en_US/Release.md +++ b/docs/en_US/Release.md @@ -206,7 +206,7 @@ * Documentation - Update the docs structure -Issue #1231 - - [Multi phase document improvement](AdvancedFeature/MultiPhase.md) -Issue #1233 -PR #1242 + - (deprecated) Multi phase document improvement -Issue #1233 -PR #1242 + Add configuration example - [WebUI description improvement](Tutorial/WebUI.md) -PR #1419 @@ -234,12 +234,10 @@ * Add `enas-mode` and `oneshot-mode` for NAS interface: [PR #1201](https://github.com/microsoft/nni/pull/1201#issue-291094510) * [Gaussian Process Tuner with Matern kernel](Tuner/GPTuner.md) -* Multiphase experiment supports +* (deprecated) Multiphase experiment supports * Added new training service support for multiphase experiment: PAI mode supports multiphase experiment since v0.9. * Added multiphase capability for the following builtin tuners: * TPE, Random Search, Anneal, Naïve Evolution, SMAC, Network Morphism, Metis Tuner. - - For details, please refer to [Write a tuner that leverages multi-phase](AdvancedFeature/MultiPhase.md) * Web Portal * Enable trial comparation in Web Portal. For details, refer to [View trials status](Tutorial/WebUI.md) @@ -549,4 +547,3 @@ Initial release of Neural Network Intelligence (NNI). * Support CI by providing out-of-box integration with [travis-ci](https://github.com/travis-ci) on ubuntu * Others * Support simple GPU job scheduling - diff --git a/docs/en_US/Tuner/CustomizeTuner.md b/docs/en_US/Tuner/CustomizeTuner.md index b0788e0788..0e7b40d533 100644 --- a/docs/en_US/Tuner/CustomizeTuner.md +++ b/docs/en_US/Tuner/CustomizeTuner.md @@ -51,7 +51,7 @@ class CustomizedTuner(Tuner): ... 
``` -`receive_trial_result` will receive the `parameter_id, parameters, value` as parameters input. Also, Tuner will receive the `value` object are exactly same value that Trial send. If `multiPhase` is set to `true` in the experiment configuration file, an additional `trial_job_id` parameter is passed to `receive_trial_result` and `generate_parameters` through the `**kwargs` parameter. +`receive_trial_result` will receive the `parameter_id, parameters, value` as parameters input. Also, Tuner will receive the `value` object are exactly same value that Trial send. The `your_parameters` return from `generate_parameters` function, will be package as json object by NNI SDK. NNI SDK will unpack json object so the Trial will receive the exact same `your_parameters` from Tuner. @@ -109,4 +109,4 @@ More detail example you could see: ### Write a more advanced automl algorithm -The methods above are usually enough to write a general tuner. However, users may also want more methods, for example, intermediate results, trials' state (e.g., the methods in assessor), in order to have a more powerful automl algorithm. Therefore, we have another concept called `advisor` which directly inherits from `MsgDispatcherBase` in [`src/sdk/pynni/nni/msg_dispatcher_base.py`](https://github.com/Microsoft/nni/tree/master/src/sdk/pynni/nni/msg_dispatcher_base.py). Please refer to [here](CustomizeAdvisor.md) for how to write a customized advisor. \ No newline at end of file +The methods above are usually enough to write a general tuner. However, users may also want more methods, for example, intermediate results, trials' state (e.g., the methods in assessor), in order to have a more powerful automl algorithm. Therefore, we have another concept called `advisor` which directly inherits from `MsgDispatcherBase` in [`src/sdk/pynni/nni/msg_dispatcher_base.py`](https://github.com/Microsoft/nni/tree/master/src/sdk/pynni/nni/msg_dispatcher_base.py). 
Please refer to [here](CustomizeAdvisor.md) for how to write a customized advisor. diff --git a/docs/en_US/Tutorial/ExperimentConfig.md b/docs/en_US/Tutorial/ExperimentConfig.md index 0a3cdca905..6bcd2e53d2 100644 --- a/docs/en_US/Tutorial/ExperimentConfig.md +++ b/docs/en_US/Tutorial/ExperimentConfig.md @@ -17,7 +17,6 @@ This document describes the rules to write the config file, and provides some ex + [trainingServicePlatform](#trainingserviceplatform) + [searchSpacePath](#searchspacepath) + [useAnnotation](#useannotation) - + [multiPhase](#multiphase) + [multiThread](#multithread) + [nniManagerIp](#nnimanagerip) + [logDir](#logdir) @@ -94,8 +93,6 @@ searchSpacePath: #choice: true, false, default: false useAnnotation: #choice: true, false, default: false -multiPhase: -#choice: true, false, default: false multiThread: tuner: #choice: TPE, Random, Anneal, Evolution @@ -130,8 +127,6 @@ searchSpacePath: #choice: true, false, default: false useAnnotation: #choice: true, false, default: false -multiPhase: -#choice: true, false, default: false multiThread: tuner: #choice: TPE, Random, Anneal, Evolution @@ -171,8 +166,6 @@ trainingServicePlatform: #choice: true, false, default: false useAnnotation: #choice: true, false, default: false -multiPhase: -#choice: true, false, default: false multiThread: tuner: #choice: TPE, Random, Anneal, Evolution @@ -283,12 +276,6 @@ Use annotation to analysis trial code and generate search space. Note: if __useAnnotation__ is true, the searchSpacePath field should be removed. -### multiPhase - -Optional. Bool. Default: false. - -Enable [multi-phase experiment](../AdvancedFeature/MultiPhase.md). - ### multiThread Optional. Bool. Default: false. 
diff --git a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md index 613df37be3..00657f28e3 100644 --- a/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md +++ b/docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md @@ -67,10 +67,8 @@ It doesn't need to redeploy, but the nnictl may need to be restarted. #### TypeScript -* If `src/nni_manager` will be changed, run `yarn watch` continually under this folder. It will rebuild code instantly. -* If `src/webui` or `src/nasui` is changed, use **step 3** to rebuild code. - -The nnictl may need to be restarted. +* If `src/nni_manager` is changed, run `yarn watch` continually under this folder. It will rebuild code instantly. The nnictl may need to be restarted to reload NNI manager. +* If `src/webui` or `src/nasui` are changed, run `yarn start` under the corresponding folder. The web UI will refresh automatically if code is changed. --- diff --git a/docs/en_US/hpo_advanced.rst b/docs/en_US/hpo_advanced.rst index 1e04a7c722..50b9b236d2 100644 --- a/docs/en_US/hpo_advanced.rst +++ b/docs/en_US/hpo_advanced.rst @@ -4,7 +4,6 @@ Advanced Features .. toctree:: :maxdepth: 2 - Enable Multi-phase Write a New Tuner Write a New Assessor Write a New Advisor diff --git a/docs/zh_CN/AdvancedFeature/MultiPhase.md b/docs/zh_CN/AdvancedFeature/MultiPhase.md deleted file mode 100644 index 4b0a560180..0000000000 --- a/docs/zh_CN/AdvancedFeature/MultiPhase.md +++ /dev/null @@ -1,88 +0,0 @@ -# 多阶段 - -## 多阶段 Experiment - -通常,每个 Trial 任务只需要从 Tuner 获取一个配置(超参等),然后使用这个配置执行并报告结果,然后退出。 但有时,一个 Trial 任务可能需要从 Tuner 请求多次配置。 这是一个非常有用的功能。 例如: - -1. 在一些训练平台上,需要数十秒来启动一个任务。 如果一个配置只需要一分钟就能完成,那么每个 Trial 任务中只运行一个配置就会非常低效。 这种情况下,可以在同一个 Trial 任务中,完成一个配置后,再请求并完成另一个配置。 极端情况下,一个 Trial 任务可以运行无数个配置。 如果设置了并发(例如设为 6),那么就会有 6 个**长时间**运行的任务来不断尝试不同的配置。 - -2. 
有些类型的模型需要进行多阶段的训练,而下一个阶段的配置依赖于前一个阶段的结果。 例如,为了找到模型最好的量化结果,训练过程通常为:自动量化算法(例如 NNI 中的 TunerJ)选择一个位宽(如 16 位), Trial 任务获得此配置,并训练数个 epoch,并返回结果(例如精度)。 算法收到结果后,决定是将 16 位改为 8 位,还是 32 位。 此过程会重复多次。 - -上述情况都可以通过多阶段执行的功能来支持。 为了支持这些情况,一个 Trial 任务需要能从 Tuner 请求多个配置。 Tuner 需要知道两次配置请求是否来自同一个 Trial 任务。 同时,多阶段中的 Trial 任务需要多次返回最终结果。 - -## 创建多阶段的 Experiment - -### 实现使用多阶段的 Trial 代码: - -**1. 更新 Trial 代码** - -Trial 代码中使用多阶段非常容易,示例如下: - -```python -# ... -for i in range(5): - # 从 Tuner 中获得参数 - tuner_param = nni.get_next_parameter() - # 如果没有更多超参可生成,nni.get_next_parameter 会返回 None。 - if tuner_param is None: - break - - # 使用参数 - # ... - # 返回最终结果 - nni.report_final_result() - # ... -# ... -``` - -在多阶段 Experiment 中,每次 API `nni.get_next_parameter()` 被调用时,会返回 Tuner 新生成的超参,然后 Trial 代码会使用新的超参,并返回其最终结果。 `nni.get_next_parameter()` 和 `nni.report_final_result()` 需要依次被调用:**先调用前者,然后调用后者,并按此顺序重复调用**。 如果 `nni.get_next_parameter()` 被连续多次调用,然后再调用 `nni.report_final_result()`,这会造成最终结果只会与 get_next_parameter 所返回的最后一个配置相关联。 因此,前面的 get_next_parameter 调用都没有关联的结果,这可能会造成一些多阶段算法出问题。 - -注意,如果 `nni.get_next_parameter` 返回 None,表示 Tuner 没有生成更多的超参。 - -**2. Experiment 配置** - -要启用多阶段,需要在 Experiment 的 YAML 配置文件中增加 `multiPhase: true`。 如果不添加此参数,`nni.get_next_parameter()` 会一直返回同样的配置。 - -多阶段 Experiment 配置示例: - -```yaml -authorName: default -experimentName: multiphase experiment -trialConcurrency: 2 -maxExecDuration: 1h -maxTrialNum: 8 -trainingServicePlatform: local -searchSpacePath: search_space.json -multiPhase: true -useAnnotation: false -tuner: - builtinTunerName: TPE - classArgs: - optimize_mode: maximize -trial: - command: python3 mytrial.py - codeDir: . 
- gpuNum: 0 -``` - -### 实现使用多阶段的 Tuner: - -强烈建议首先阅读[自定义 Tuner](https://nni.readthedocs.io/zh/latest/Tuner/CustomizeTuner.html),再开始实现多阶段 Tuner。 与普通 Tuner 一样,需要从 `Tuner` 类继承。 当通过配置启用多阶段时(将 `multiPhase` 设为 true),Tuner 会通过下列方法得到一个新的参数 `trial_job_id`: - -```text -generate_parameters -generate_multiple_parameters -receive_trial_result -receive_customized_trial_result -trial_end -``` - -有了这个信息, Tuner 能够知道哪个 Trial 在请求配置信息, 返回的结果是哪个 Trial 的。 通过此信息,Tuner 能够灵活的为不同的 Trial 及其阶段实现功能。 例如,可在 generate_parameters 方法中使用 trial_job_id 来为特定的 Trial 任务生成超参。 - -### 支持多阶段 Experiment 的 Tuner: - -[TPE](../Tuner/HyperoptTuner.md), [Random](../Tuner/HyperoptTuner.md), [Anneal](../Tuner/HyperoptTuner.md), [Evolution](../Tuner/EvolutionTuner.md), [SMAC](../Tuner/SmacTuner.md), [NetworkMorphism](../Tuner/NetworkmorphismTuner.md), [MetisTuner](../Tuner/MetisTuner.md), [BOHB](../Tuner/BohbAdvisor.md), [Hyperband](../Tuner/HyperbandAdvisor.md). - -### 支持多阶段 Experiment 的训练平台: - -[本机](../TrainingService/LocalMode.md), [远程计算机](../TrainingService/RemoteMachineMode.md), [OpenPAI](../TrainingService/PaiMode.md) \ No newline at end of file diff --git a/src/webui/src/components/Modals/ExperimentDrawer.tsx b/src/webui/src/components/Modals/ExperimentDrawer.tsx index e0b5594bf4..cee710c9fe 100644 --- a/src/webui/src/components/Modals/ExperimentDrawer.tsx +++ b/src/webui/src/components/Modals/ExperimentDrawer.tsx @@ -7,6 +7,7 @@ import { import { MANAGER_IP, DRAWEROPTION } from '../../static/const'; import MonacoEditor from 'react-monaco-editor'; import '../../static/style/logDrawer.scss'; +import { TrialManager } from '../../static/model/trialmanager'; interface ExpDrawerProps { isVisble: boolean; @@ -37,27 +38,27 @@ class ExperimentDrawer extends React.Component { axios.get(`${MANAGER_IP}/trial-jobs`), axios.get(`${MANAGER_IP}/metric-data`) ]) - .then(axios.spread((res, res1, res2) => { - if (res.status === 200 && res1.status === 200 && res2.status === 200) { - if (res.data.params.searchSpace) { - 
res.data.params.searchSpace = JSON.parse(res.data.params.searchSpace); + .then(axios.spread((resExperiment, resTrialJobs, resMetricData) => { + if (resExperiment.status === 200 && resTrialJobs.status === 200 && resMetricData.status === 200) { + if (resExperiment.data.params.searchSpace) { + resExperiment.data.params.searchSpace = JSON.parse(resExperiment.data.params.searchSpace); } - const trialMessagesArr = res1.data; - const interResultList = res2.data; + const trialMessagesArr = TrialManager.expandJobsToTrials(resTrialJobs.data); + const interResultList = resMetricData.data; Object.keys(trialMessagesArr).map(item => { // not deal with trial's hyperParameters const trialId = trialMessagesArr[item].id; // add intermediate result message trialMessagesArr[item].intermediate = []; Object.keys(interResultList).map(key => { - const interId = interResultList[key].trialJobId; + const interId = `${interResultList[key].trialJobId}-${interResultList[key].parameterId}`; if (trialId === interId) { trialMessagesArr[item].intermediate.push(interResultList[key]); } }); }); const result = { - experimentParameters: res.data, + experimentParameters: resExperiment.data, trialMessage: trialMessagesArr }; if (this._isCompareMount === true) { diff --git a/src/webui/src/components/Modals/Killjob.tsx b/src/webui/src/components/Modals/Killjob.tsx index 758da3b93c..580ff5ff24 100644 --- a/src/webui/src/components/Modals/Killjob.tsx +++ b/src/webui/src/components/Modals/Killjob.tsx @@ -77,7 +77,7 @@ class KillJob extends React.Component { onKill = (): void => { this.setState({ isCalloutVisible: false }, () => { const { trial } = this.props; - killJob(trial.key, trial.id, trial.status); + killJob(trial.key, trial.jobId, trial.status); }); } @@ -127,4 +127,4 @@ class KillJob extends React.Component { } } -export default KillJob; \ No newline at end of file +export default KillJob; diff --git a/src/webui/src/components/public-child/OpenRow.tsx 
b/src/webui/src/components/public-child/OpenRow.tsx index a2679b836e..a0c6c274c1 100644 --- a/src/webui/src/components/public-child/OpenRow.tsx +++ b/src/webui/src/components/public-child/OpenRow.tsx @@ -3,7 +3,6 @@ import * as copy from 'copy-to-clipboard'; import { Stack, PrimaryButton, Pivot, PivotItem } from 'office-ui-fabric-react'; import { Trial } from '../../static/model/trial'; import { EXPERIMENT, TRIALS } from '../../static/datamodel'; -import { MANAGER_IP } from '../../static/const'; import JSONTree from 'react-json-tree'; import PaiTrialLog from '../public-child/PaiTrialLog'; import TrialLog from '../public-child/TrialLog'; @@ -60,31 +59,12 @@ class OpenRow extends React.Component { const { isHidenInfo, typeInfo, info } = this.state; const trialId = this.props.trialId; const trial = TRIALS.getTrial(trialId); - const trialLink: string = `${MANAGER_IP}/trial-jobs/${trialId}`; const logPathRow = trial.info.logPath || 'This trial\'s log path is not available.'; - const multiProgress = trial.info.hyperParameters === undefined ? 0 : trial.info.hyperParameters.length; return ( - { - EXPERIMENT.multiPhase - ? - - { - ` - Trails for multiphase experiment will return a set of parameters, - we are listing the latest parameter in webportal. - For the entire parameter set, please refer to the following " - ` - } - {trialLink}{`".`} -
Current Phase: {multiProgress}.
-
- : - null - } { trial.info.hyperParameters !== undefined ? diff --git a/src/webui/src/components/trial-detail/TableList.tsx b/src/webui/src/components/trial-detail/TableList.tsx index c34aa8fb20..00a89f086a 100644 --- a/src/webui/src/components/trial-detail/TableList.tsx +++ b/src/webui/src/components/trial-detail/TableList.tsx @@ -9,7 +9,7 @@ import { LineChart, blocked, copy } from '../Buttons/Icon'; import { MANAGER_IP, COLUMNPro } from '../../static/const'; import { convertDuration, formatTimestamp, intermediateGraphOption, parseMetrics } from '../../static/function'; import { EXPERIMENT, TRIALS } from '../../static/datamodel'; -import { TableRecord } from '../../static/interface'; +import { TableRecord, TrialJobInfo } from '../../static/interface'; import Details from '../overview/Details'; import ChangeColumnComponent from '../Modals/ChangeColumnComponent'; import Compare from '../Modals/Compare'; @@ -231,18 +231,23 @@ class TableList extends React.Component { ) }; - showIntermediateModal = async (id: string, event: React.SyntheticEvent): Promise => { + showIntermediateModal = async (record: TrialJobInfo, event: React.SyntheticEvent): Promise => { event.preventDefault(); event.stopPropagation(); - const res = await axios.get(`${MANAGER_IP}/metric-data/${id}`); + const res = await axios.get(`${MANAGER_IP}/metric-data/${record.jobId}`); if (res.status === 200) { const intermediateArr: number[] = []; // support intermediate result is dict because the last intermediate result is // final result in a succeed trial, it may be a dict. 
// get intermediate result dict keys array const { intermediateKey } = this.state; - const otherkeys: string[] = [ ]; - if (res.data.length !== 0) { + const otherkeys: string[] = []; + // One trial job may contains multiple parameter id + // only show current trial's metric data + const metricDatas = res.data.filter(item => { + return item.parameterId == record.parameterId; + }); + if (metricDatas.length !== 0) { // just add type=number keys const intermediateMetrics = parseMetrics(res.data[0].data); for (const key in intermediateMetrics) { @@ -252,9 +257,10 @@ class TableList extends React.Component { } } // intermediateArr just store default val - Object.keys(res.data).map(item => { - if (res.data[item].type === 'PERIODICAL') { - const temp = parseMetrics(res.data[item].data); + metricDatas.map(item => { + + if (item.type === 'PERIODICAL') { + const temp = parseMetrics(item.data); if (typeof temp === 'object') { intermediateArr.push(temp[intermediateKey]); } else { @@ -262,12 +268,12 @@ class TableList extends React.Component { } } }); - const intermediate = intermediateGraphOption(intermediateArr, id); + const intermediate = intermediateGraphOption(intermediateArr, record.id); this.setState({ intermediateData: res.data, // store origin intermediate data for a trial intermediateOption: intermediate, intermediateOtherKeys: otherkeys, - intermediateId: id + intermediateId: record.id }); } this.setState({ modalVisible: true }); @@ -426,8 +432,6 @@ class TableList extends React.Component { // when user click [Add Column] need to use the function private initTableColumnList = (columnList: string[]): IColumn[] => { // const { columnList } = this.props; - // [supportCustomizedTrial: true] - const supportCustomizedTrial = (EXPERIMENT.multiPhase === true) ? 
false : true; const disabledAddCustomizedTrial = ['DONE', 'ERROR', 'STOPPED'].includes(EXPERIMENT.status); const showColumn: IColumn[] = []; for (const item of columnList) { @@ -479,7 +483,7 @@ class TableList extends React.Component { {LineChart} @@ -494,20 +498,14 @@ class TableList extends React.Component { } {/* Add a new trial-customized trial */} - { - supportCustomizedTrial - ? - - {copy} - - : - null - } + + {copy} +
); }, @@ -659,4 +657,4 @@ class TableList extends React.Component { } } -export default TableList; \ No newline at end of file +export default TableList; diff --git a/src/webui/src/static/const.ts b/src/webui/src/static/const.ts index 1f82d3210d..4afc70acd7 100644 --- a/src/webui/src/static/const.ts +++ b/src/webui/src/static/const.ts @@ -2,7 +2,10 @@ const METRIC_GROUP_UPDATE_THRESHOLD = 100; const METRIC_GROUP_UPDATE_SIZE = 20; -const MANAGER_IP = `/api/v1/nni`; +let MANAGER_IP = `/api/v1/nni`; +if (process.env.NODE_ENV == "development") { + MANAGER_IP = `//${window.location.hostname}:8080` + MANAGER_IP; +} const DOWNLOAD_IP = `/logs`; const WEBUIDOC = 'https://nni.readthedocs.io/en/latest/Tutorial/WebUI.html'; const trialJobStatus = [ @@ -34,8 +37,8 @@ const OPERATION = 'Operation'; const COLUMN = ['Trial No.', 'ID', 'Duration', 'Status', 'Default', OPERATION]; // all choice column !dictory final const COLUMNPro = ['Trial No.', 'ID', 'Start Time', 'End Time', 'Duration', 'Status', -'Intermediate result', 'Default', OPERATION]; -const CONCURRENCYTOOLTIP = 'Trial concurrency is the number of trials running concurrently.'; + 'Intermediate result', 'Default', OPERATION]; +const CONCURRENCYTOOLTIP = 'Trial concurrency is the number of trials running concurrently.'; export { MANAGER_IP, DOWNLOAD_IP, trialJobStatus, COLUMNPro, WEBUIDOC, diff --git a/src/webui/src/static/interface.ts b/src/webui/src/static/interface.ts index 5254c3702e..3ed466ca19 100644 --- a/src/webui/src/static/interface.ts +++ b/src/webui/src/static/interface.ts @@ -18,6 +18,8 @@ interface TableRecord { startTime: number; endTime?: number; id: string; + jobId: string; + parameterId: string; duration: number; status: string; intermediateCount: number; @@ -99,6 +101,7 @@ interface Intermedia { interface MetricDataRecord { timestamp: number; trialJobId: string; + trialId: string; parameterId: string; type: string; sequence: number; @@ -107,6 +110,8 @@ interface MetricDataRecord { interface TrialJobInfo 
{ id: string; + jobId: string; + parameterId: string; sequenceId: number; status: string; startTime?: number; @@ -126,7 +131,6 @@ interface ExperimentParams { maxTrialNum: number; searchSpace: string; trainingServicePlatform: string; - multiPhase?: boolean; multiThread?: boolean; versionCheck?: boolean; logCollection?: string; @@ -189,4 +193,4 @@ export { AccurPoint, DetailAccurPoint, TooltipForIntermediate, TooltipForAccuracy, Dimobj, ParaObj, Intermedia, MetricDataRecord, TrialJobInfo, ExperimentParams, ExperimentProfile, NNIManagerStatus, EventMap -}; \ No newline at end of file +}; diff --git a/src/webui/src/static/model/experiment.ts b/src/webui/src/static/model/experiment.ts index f1beffda0a..62efe4a99e 100644 --- a/src/webui/src/static/model/experiment.ts +++ b/src/webui/src/static/model/experiment.ts @@ -66,10 +66,6 @@ class Experiment { return !!(this.profile.params.logCollection && this.profile.params.logCollection !== 'none'); } - get multiPhase(): boolean { - return !!(this.profile.params.multiPhase); - } - get status(): string { if (!this.statusField) { throw Error('Experiment status not initialized'); diff --git a/src/webui/src/static/model/trial.ts b/src/webui/src/static/model/trial.ts index f5a4f3f9ef..da3b982377 100644 --- a/src/webui/src/static/model/trial.ts +++ b/src/webui/src/static/model/trial.ts @@ -4,7 +4,7 @@ import { getFinal, formatAccuracy, metricAccuracy, parseMetrics, isArrayType } f class Trial implements TableObj { private metricsInitialized: boolean = false; private infoField: TrialJobInfo | undefined; - private intermediates: (MetricDataRecord | undefined)[] = [ ]; + private intermediates: (MetricDataRecord | undefined)[] = []; public final: MetricDataRecord | undefined; private finalAcc: number | undefined; @@ -29,7 +29,7 @@ class Trial implements TableObj { } get intermediateMetrics(): MetricDataRecord[] { - const ret: MetricDataRecord[] = [ ]; + const ret: MetricDataRecord[] = []; for (let i = 0; i < this.intermediates.length; 
i++) { if (this.intermediates[i]) { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion @@ -80,6 +80,8 @@ class Trial implements TableObj { key: this.info.id, sequenceId: this.info.sequenceId, id: this.info.id, + jobId: this.info.jobId, + parameterId: this.info.parameterId, // eslint-disable-next-line @typescript-eslint/no-non-null-assertion startTime: this.info.startTime!, endTime: this.info.endTime, @@ -122,8 +124,8 @@ class Trial implements TableObj { get description(): Parameters { const ret: Parameters = { - parameters: { }, - intermediate: [ ], + parameters: {}, + intermediate: [], multiProgress: 1 }; const tempHyper = this.info.hyperParameters; @@ -142,7 +144,7 @@ class Trial implements TableObj { ret.logPath = this.info.logPath; } - const mediate: number[] = [ ]; + const mediate: number[] = []; for (const items of this.intermediateMetrics) { if (typeof parseMetrics(items.data) === 'object') { mediate.push(parseMetrics(items.data).default); diff --git a/src/webui/src/static/model/trialmanager.ts b/src/webui/src/static/model/trialmanager.ts index 3e742995da..6c9ced1144 100644 --- a/src/webui/src/static/model/trialmanager.ts +++ b/src/webui/src/static/model/trialmanager.ts @@ -6,13 +6,29 @@ import { Trial } from './trial'; function groupMetricsByTrial(metrics: MetricDataRecord[]): Map { const ret = new Map(); for (const metric of metrics) { - if (ret.has(metric.trialJobId)) { + const trialId = `${metric.trialJobId}-${metric.parameterId}`; + metric.trialId = trialId; + if (ret.has(trialId)) { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - ret.get(metric.trialJobId)!.push(metric); + ret.get(trialId)!.push(metric); } else { - ret.set(metric.trialJobId, [ metric ]); + ret.set(trialId, [metric]); } } + // to compatiable with multi-trial in same job, fix offset of sequence + ret.forEach((trialMetrics) => { + let minSequenceNumber = Number.POSITIVE_INFINITY; + trialMetrics.map((item) => { + if (item.sequence < 
minSequenceNumber && item.type !== "FINAL") { + minSequenceNumber = item.sequence; + } + }); + trialMetrics.map((item) => { + if (item.type !== "FINAL") { + item.sequence -= minSequenceNumber; + } + }); + }); return ret; } @@ -31,7 +47,7 @@ class TrialManager { } public async update(lastTime?: boolean): Promise { - const [ infoUpdated, metricUpdated ] = await Promise.all([ this.updateInfo(), this.updateMetrics(lastTime) ]); + const [infoUpdated, metricUpdated] = await Promise.all([this.updateInfo(), this.updateMetrics(lastTime)]); return infoUpdated || metricUpdated; } @@ -71,14 +87,14 @@ class TrialManager { public countStatus(): Map { const cnt = new Map([ - [ 'UNKNOWN', 0 ], - [ 'WAITING', 0 ], - [ 'RUNNING', 0 ], - [ 'SUCCEEDED', 0 ], - [ 'FAILED', 0 ], - [ 'USER_CANCELED', 0 ], - [ 'SYS_CANCELED', 0 ], - [ 'EARLY_STOPPED', 0 ], + ['UNKNOWN', 0], + ['WAITING', 0], + ['RUNNING', 0], + ['SUCCEEDED', 0], + ['FAILED', 0], + ['USER_CANCELED', 0], + ['SYS_CANCELED', 0], + ['EARLY_STOPPED', 0], ]); for (const trial of this.trials.values()) { if (trial.initialized()) { @@ -89,19 +105,71 @@ class TrialManager { return cnt; } + public static expandJobsToTrials(jobs: TrialJobInfo[]): TrialJobInfo[] { + const trials: TrialJobInfo[] = []; + + for (const jobInfo of jobs as TrialJobInfo[]) { + if (jobInfo.hyperParameters) { + let trial: TrialJobInfo | undefined; + let lastTrial: TrialJobInfo | undefined; + for (let i = 0; i < jobInfo.hyperParameters.length; i++) { + const hyperParameters = jobInfo.hyperParameters[i] + const hpObject = JSON.parse(hyperParameters); + const parameterId = hpObject["parameter_id"]; + trial = { + id: `${jobInfo.id}-${parameterId}`, + jobId: jobInfo.id, + parameterId: parameterId, + sequenceId: parameterId, + status: "SUCCEEDED", + startTime: jobInfo.startTime, + endTime: jobInfo.startTime, + hyperParameters: [hyperParameters], + logPath: jobInfo.logPath, + stderrPath: jobInfo.stderrPath, + }; + if (jobInfo.finalMetricData) { + for (const metricData 
of jobInfo.finalMetricData) { + if (metricData.parameterId == parameterId) { + trial.finalMetricData = [metricData]; + trial.endTime = metricData.timestamp; + break; + } + } + } + if (lastTrial) { + trial.startTime = lastTrial.endTime; + } else { + trial.startTime = jobInfo.startTime; + } + lastTrial = trial; + trials.push(trial); + } + if (lastTrial !== undefined) { + lastTrial.status = jobInfo.status; + lastTrial.endTime = jobInfo.endTime; + } + } else { + trials.push(jobInfo); + } + } + return trials; + } + private async updateInfo(): Promise { const response = await axios.get(`${MANAGER_IP}/trial-jobs`); let updated = false; if (response.status === 200) { - for (const info of response.data as TrialJobInfo[]) { - if (this.trials.has(info.id)) { + const newTrials = TrialManager.expandJobsToTrials(response.data); + for (const trialInfo of newTrials as TrialJobInfo[]) { + if (this.trials.has(trialInfo.id)) { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion - updated = this.trials.get(info.id)!.updateTrialJobInfo(info) || updated; + updated = this.trials.get(trialInfo.id)!.updateTrialJobInfo(trialInfo) || updated; } else { - this.trials.set(info.id, new Trial(info, undefined)); + this.trials.set(trialInfo.id, new Trial(trialInfo, undefined)); updated = true; } - this.maxSequenceId = Math.max(this.maxSequenceId, info.sequenceId); + this.maxSequenceId = Math.max(this.maxSequenceId, trialInfo.sequenceId); } this.infoInitialized = true; } @@ -146,7 +214,7 @@ class TrialManager { private doUpdateMetrics(allMetrics: MetricDataRecord[], latestOnly: boolean): boolean { let updated = false; - for (const [ trialId, metrics ] of groupMetricsByTrial(allMetrics).entries()) { + for (const [trialId, metrics] of groupMetricsByTrial(allMetrics).entries()) { if (this.trials.has(trialId)) { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion const trial = this.trials.get(trialId)!; From 3efc59ee494eaefbc3fc3edd344766129d958ef4 Mon Sep 17 00:00:00 
2001 From: QuanluZhang Date: Mon, 11 May 2020 10:42:20 +0800 Subject: [PATCH 6/8] improve PBT tuner (#2357) --- src/sdk/pynni/nni/pbt_tuner/pbt_tuner.py | 173 ++++++++++++++++----- src/sdk/pynni/tests/test_builtin_tuners.py | 57 +++++++ 2 files changed, 189 insertions(+), 41 deletions(-) diff --git a/src/sdk/pynni/nni/pbt_tuner/pbt_tuner.py b/src/sdk/pynni/nni/pbt_tuner/pbt_tuner.py index e943752e84..f1a0189aad 100755 --- a/src/sdk/pynni/nni/pbt_tuner/pbt_tuner.py +++ b/src/sdk/pynni/nni/pbt_tuner/pbt_tuner.py @@ -74,18 +74,16 @@ def exploit_and_explore(bot_trial_info, top_trial_info, factor, resample_probabi top_hyper_parameters = top_trial_info.hyper_parameters hyper_parameters = copy.deepcopy(top_hyper_parameters) random_state = np.random.RandomState() + hyper_parameters['load_checkpoint_dir'] = hyper_parameters['save_checkpoint_dir'] + hyper_parameters['save_checkpoint_dir'] = os.path.join(bot_checkpoint_dir, str(epoch)) for key in hyper_parameters.keys(): hyper_parameter = hyper_parameters[key] - if key == 'load_checkpoint_dir': - hyper_parameters[key] = hyper_parameters['save_checkpoint_dir'] - continue - elif key == 'save_checkpoint_dir': - hyper_parameters[key] = os.path.join(bot_checkpoint_dir, str(epoch)) + if key == 'load_checkpoint_dir' or key == 'save_checkpoint_dir': continue elif search_space[key]["_type"] == "choice": choices = search_space[key]["_value"] - ub, uv = len(choices) - 1, choices.index(hyper_parameter["_value"]) + 1 - lb, lv = 0, choices.index(hyper_parameter["_value"]) - 1 + ub, uv = len(choices) - 1, choices.index(hyper_parameter) + 1 + lb, lv = 0, choices.index(hyper_parameter) - 1 elif search_space[key]["_type"] == "randint": lb, ub = search_space[key]["_value"][:2] ub -= 1 @@ -132,10 +130,11 @@ def exploit_and_explore(bot_trial_info, top_trial_info, factor, resample_probabi else: logger.warning("Illegal type to perturb: %s", search_space[key]["_type"]) continue + if search_space[key]["_type"] == "choice": idx = 
perturbation(search_space[key]["_type"], search_space[key]["_value"], resample_probability, uv, ub, lv, lb, random_state) - hyper_parameters[key] = {'_index': idx, '_value': choices[idx]} + hyper_parameters[key] = choices[idx] else: hyper_parameters[key] = perturbation(search_space[key]["_type"], search_space[key]["_value"], resample_probability, uv, ub, lv, lb, random_state) @@ -231,6 +230,7 @@ def update_search_space(self, search_space): for i in range(self.population_size): hyper_parameters = json2parameter( self.searchspace_json, is_rand, self.random_state) + hyper_parameters = split_index(hyper_parameters) checkpoint_dir = os.path.join(self.all_checkpoint_dir, str(i)) hyper_parameters['load_checkpoint_dir'] = os.path.join(checkpoint_dir, str(self.epoch)) hyper_parameters['save_checkpoint_dir'] = os.path.join(checkpoint_dir, str(self.epoch)) @@ -294,7 +294,42 @@ def generate_parameters(self, parameter_id, **kwargs): trial_info.parameter_id = parameter_id self.running[parameter_id] = trial_info logger.info('Generate parameter : %s', trial_info.hyper_parameters) - return split_index(trial_info.hyper_parameters) + return trial_info.hyper_parameters + + def _proceed_next_epoch(self): + """ + """ + logger.info('Proceeding to next epoch') + self.epoch += 1 + self.population = [] + self.pos = -1 + self.running = {} + #exploit and explore + reverse = True if self.optimize_mode == OptimizeMode.Maximize else False + self.finished = sorted(self.finished, key=lambda x: x.score, reverse=reverse) + cutoff = int(np.ceil(self.fraction * len(self.finished))) + tops = self.finished[:cutoff] + bottoms = self.finished[self.finished_trials - cutoff:] + for bottom in bottoms: + top = np.random.choice(tops) + exploit_and_explore(bottom, top, self.factor, self.resample_probability, self.epoch, self.searchspace_json) + for trial in self.finished: + if trial not in bottoms: + trial.clean_id() + trial.hyper_parameters['load_checkpoint_dir'] = trial.hyper_parameters['save_checkpoint_dir'] 
+ trial.hyper_parameters['save_checkpoint_dir'] = os.path.join(trial.checkpoint_dir, str(self.epoch)) + self.finished_trials = 0 + for _ in range(self.population_size): + trial_info = self.finished.pop() + self.population.append(trial_info) + while self.credit > 0 and self.pos + 1 < len(self.population): + self.credit -= 1 + self.pos += 1 + parameter_id = self.param_ids.pop() + trial_info = self.population[self.pos] + trial_info.parameter_id = parameter_id + self.running[parameter_id] = trial_info + self.send_trial_callback(parameter_id, trial_info.hyper_parameters) def receive_trial_result(self, parameter_id, parameters, value, **kwargs): """ @@ -312,43 +347,99 @@ def receive_trial_result(self, parameter_id, parameters, value, **kwargs): """ logger.info('Get one trial result, id = %d, value = %s', parameter_id, value) value = extract_scalar_reward(value) + trial_info = self.running.pop(parameter_id, None) + trial_info.score = value + self.finished.append(trial_info) + self.finished_trials += 1 + if self.finished_trials == self.population_size: + self._proceed_next_epoch() + + def trial_end(self, parameter_id, success, **kwargs): + """ + Deal with trial failure + + Parameters + ---------- + parameter_id : int + Unique identifier for hyper-parameters used by this trial. + success : bool + True if the trial successfully completed; False if failed or terminated. + **kwargs + Unstable parameters which should be ignored by normal users. 
+ """ + if success: + return if self.optimize_mode == OptimizeMode.Minimize: - value = -value + value = float('inf') + else: + value = float('-inf') trial_info = self.running.pop(parameter_id, None) trial_info.score = value self.finished.append(trial_info) self.finished_trials += 1 if self.finished_trials == self.population_size: - logger.info('Proceeding to next epoch') - self.epoch += 1 - self.population = [] - self.pos = -1 - self.running = {} - #exploit and explore - self.finished = sorted(self.finished, key=lambda x: x.score, reverse=True) - cutoff = int(np.ceil(self.fraction * len(self.finished))) - tops = self.finished[:cutoff] - bottoms = self.finished[self.finished_trials - cutoff:] - for bottom in bottoms: - top = np.random.choice(tops) - exploit_and_explore(bottom, top, self.factor, self.resample_probability, self.epoch, self.searchspace_json) - for trial in self.finished: - if trial not in bottoms: - trial.clean_id() - trial.hyper_parameters['load_checkpoint_dir'] = trial.hyper_parameters['save_checkpoint_dir'] - trial.hyper_parameters['save_checkpoint_dir'] = os.path.join(trial.checkpoint_dir, str(self.epoch)) - self.finished_trials = 0 - for _ in range(self.population_size): - trial_info = self.finished.pop() - self.population.append(trial_info) - while self.credit > 0 and self.pos + 1 < len(self.population): - self.credit -= 1 - self.pos += 1 - parameter_id = self.param_ids.pop() - trial_info = self.population[self.pos] - trial_info.parameter_id = parameter_id - self.running[parameter_id] = trial_info - self.send_trial_callback(parameter_id, split_index(trial_info.hyper_parameters)) + self._proceed_next_epoch() def import_data(self, data): - pass + """ + Parameters + ---------- + data : json obj + imported data records + + Returns + ------- + int + the start epoch number after data imported, only used for unittest + """ + if self.running: + logger.warning("Do not support importing data in the middle of experiment") + return + # the following is for 
experiment resume + _completed_num = 0 + epoch_data_dict = {} + for trial_info in data: + logger.info("Process data record %s / %s", _completed_num, len(data)) + _completed_num += 1 + # simply validate data format + _params = trial_info["parameter"] + _value = trial_info['value'] + # assign fake value for failed trials + if not _value: + logger.info("Useless trial data, value is %s, skip this trial data.", _value) + _value = float('inf') if self.optimize_mode == OptimizeMode.Minimize else float('-inf') + _value = extract_scalar_reward(_value) + if 'save_checkpoint_dir' not in _params: + logger.warning("Invalid data record: save_checkpoint_dir is missing, abandon data import.") + return + epoch_num = int(os.path.basename(_params['save_checkpoint_dir'])) + if epoch_num not in epoch_data_dict: + epoch_data_dict[epoch_num] = [] + epoch_data_dict[epoch_num].append((_params, _value)) + if not epoch_data_dict: + logger.warning("No valid epochs, abandon data import.") + return + # figure out start epoch for resume + max_epoch_num = max(epoch_data_dict, key=int) + if len(epoch_data_dict[max_epoch_num]) < self.population_size: + max_epoch_num -= 1 + # If there is no a single complete round, no data to import, start from scratch + if max_epoch_num < 0: + logger.warning("No completed epoch, abandon data import.") + return + assert len(epoch_data_dict[max_epoch_num]) == self.population_size + # check existence of trial save checkpoint dir + for params, _ in epoch_data_dict[max_epoch_num]: + if not os.path.isdir(params['save_checkpoint_dir']): + logger.warning("save_checkpoint_dir %s does not exist, data will not be resumed", params['save_checkpoint_dir']) + return + # resume data + self.epoch = max_epoch_num + self.finished_trials = self.population_size + for params, value in epoch_data_dict[max_epoch_num]: + checkpoint_dir = os.path.dirname(params['save_checkpoint_dir']) + self.finished.append(TrialInfo(checkpoint_dir=checkpoint_dir, hyper_parameters=params, score=value)) + 
self._proceed_next_epoch() + logger.info("Successfully import data to PBT tuner, total data: %d, imported data: %d.", len(data), self.population_size) + logger.info("Start from epoch %d ...", self.epoch) + return self.epoch # return for test diff --git a/src/sdk/pynni/tests/test_builtin_tuners.py b/src/sdk/pynni/tests/test_builtin_tuners.py index de0c130403..16ae6e78ef 100644 --- a/src/sdk/pynni/tests/test_builtin_tuners.py +++ b/src/sdk/pynni/tests/test_builtin_tuners.py @@ -159,6 +159,62 @@ def search_space_test_all(self, tuner_factory, supported_types=None, ignore_type logger.info("Full supported search space: %s", full_supported_search_space) self.search_space_test_one(tuner_factory, full_supported_search_space) + def import_data_test_for_pbt(self): + """ + test1: import data with complete epoch + test2: import data with incomplete epoch + """ + search_space = { + "choice_str": { + "_type": "choice", + "_value": ["cat", "dog", "elephant", "cow", "sheep", "panda"] + } + } + all_checkpoint_dir = os.path.expanduser("~/nni/checkpoint/test/") + population_size = 4 + # ===import data at the beginning=== + tuner = PBTTuner( + all_checkpoint_dir=all_checkpoint_dir, + population_size=population_size + ) + self.assertIsInstance(tuner, Tuner) + tuner.update_search_space(search_space) + save_dirs = [os.path.join(all_checkpoint_dir, str(i), str(0)) for i in range(population_size)] + # create save checkpoint directory + for save_dir in save_dirs: + os.makedirs(save_dir, exist_ok=True) + # for simplicity, omit "load_checkpoint_dir" + data = [{"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[0]}, "value": 1.1}, + {"parameter": {"choice_str": "dog", "save_checkpoint_dir": save_dirs[1]}, "value": {"default": 1.2, "tmp": 2}}, + {"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[2]}, "value": 11}, + {"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[3]}, "value": 7}] + epoch = tuner.import_data(data) + self.assertEqual(epoch, 
1) + logger.info("Imported data successfully at the beginning") + shutil.rmtree(all_checkpoint_dir) + # ===import another data at the beginning, test the case when there is an incompleted epoch=== + tuner = PBTTuner( + all_checkpoint_dir=all_checkpoint_dir, + population_size=population_size + ) + self.assertIsInstance(tuner, Tuner) + tuner.update_search_space(search_space) + for i in range(population_size - 1): + save_dirs.append(os.path.join(all_checkpoint_dir, str(i), str(1))) + for save_dir in save_dirs: + os.makedirs(save_dir, exist_ok=True) + data = [{"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[0]}, "value": 1.1}, + {"parameter": {"choice_str": "dog", "save_checkpoint_dir": save_dirs[1]}, "value": {"default": 1.2, "tmp": 2}}, + {"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[2]}, "value": 11}, + {"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[3]}, "value": 7}, + {"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[4]}, "value": 1.1}, + {"parameter": {"choice_str": "dog", "save_checkpoint_dir": save_dirs[5]}, "value": {"default": 1.2, "tmp": 2}}, + {"parameter": {"choice_str": "cat", "save_checkpoint_dir": save_dirs[6]}, "value": 11}] + epoch = tuner.import_data(data) + self.assertEqual(epoch, 1) + logger.info("Imported data successfully at the beginning with incomplete epoch") + shutil.rmtree(all_checkpoint_dir) + def import_data_test(self, tuner_factory, stype="choice_str"): """ import data at the beginning with number value and dict value @@ -297,6 +353,7 @@ def test_pbt(self): all_checkpoint_dir=os.path.expanduser("~/nni/checkpoint/test/"), population_size=100 )) + self.import_data_test_for_pbt() def tearDown(self): file_list = glob.glob("smac3*") + ["param_config_space.pcs", "scenario.txt", "model_path"] From af80021301485595e81867eb000d06d78d4c0669 Mon Sep 17 00:00:00 2001 From: Lijiaoa <61399850+Lijiaoa@users.noreply.github.com> Date: Mon, 11 May 2020 10:52:15 +0800 
Subject: [PATCH 7/8] refactor best trials (#2417) Co-authored-by: Lijiao <15910218274@163.com> --- src/webui/src/App.scss | 13 ++- src/webui/src/App.tsx | 14 ++- src/webui/src/components/Overview.tsx | 60 ++++++++---- src/webui/src/components/TrialsDetail.tsx | 91 ++++++++++--------- .../src/components/overview/BasicInfo.tsx | 6 ++ src/webui/src/components/overview/Title1.tsx | 8 +- src/webui/src/static/style/overview.scss | 1 - src/webui/src/static/style/overviewTitle.scss | 13 +-- src/webui/src/static/style/trialsDetail.scss | 14 +-- 9 files changed, 128 insertions(+), 92 deletions(-) diff --git a/src/webui/src/App.scss b/src/webui/src/App.scss index fdd51cde0c..aff42f8e6a 100644 --- a/src/webui/src/App.scss +++ b/src/webui/src/App.scss @@ -29,7 +29,18 @@ margin: 0 auto; margin-top: 74px; margin-bottom: 30px; - background: #fff; +} + +.bottomDiv{ + margin-bottom: 10px; +} + +.bgNNI{ + background-color: #fff; +} + +.borderRight{ + margin-right: 10px; } /* office-fabric-ui */ diff --git a/src/webui/src/App.tsx b/src/webui/src/App.tsx index 2188ef2447..7b57e07d69 100644 --- a/src/webui/src/App.tsx +++ b/src/webui/src/App.tsx @@ -14,6 +14,7 @@ interface AppState { metricGraphMode: 'max' | 'min'; // tuner's optimize_mode filed isillegalFinal: boolean; expWarningMessage: string; + bestTrialEntries: string; // for overview page: best trial entreis } class App extends React.Component<{}, AppState> { @@ -30,7 +31,8 @@ class App extends React.Component<{}, AppState> { trialsUpdateBroadcast: 0, metricGraphMode: 'max', isillegalFinal: false, - expWarningMessage: '' + expWarningMessage: '', + bestTrialEntries: '10' }; } @@ -92,9 +94,14 @@ class App extends React.Component<{}, AppState> { this.setState({ metricGraphMode: val }); } + // overview best trial module + changeEntries = (entries: string): void => { + this.setState({bestTrialEntries: entries}); + } + render(): React.ReactNode { const { interval, columnList, experimentUpdateBroadcast, trialsUpdateBroadcast, - 
metricGraphMode, isillegalFinal, expWarningMessage + metricGraphMode, isillegalFinal, expWarningMessage, bestTrialEntries } = this.state; if (experimentUpdateBroadcast === 0 || trialsUpdateBroadcast === 0) { return null; // TODO: render a loading page @@ -106,7 +113,8 @@ class App extends React.Component<{}, AppState> { columnList, changeColumn: this.changeColumn, experimentUpdateBroadcast, trialsUpdateBroadcast, - metricGraphMode, changeMetricGraphMode: this.changeMetricGraphMode + metricGraphMode, changeMetricGraphMode: this.changeMetricGraphMode, + bestTrialEntries, changeEntries: this.changeEntries }) ); diff --git a/src/webui/src/components/Overview.tsx b/src/webui/src/components/Overview.tsx index 1efd1a35c5..6e39feac3b 100644 --- a/src/webui/src/components/Overview.tsx +++ b/src/webui/src/components/Overview.tsx @@ -1,5 +1,5 @@ import * as React from 'react'; -import { Stack, IStackTokens } from 'office-ui-fabric-react'; +import { Stack, IStackTokens, Dropdown } from 'office-ui-fabric-react'; import { EXPERIMENT, TRIALS } from '../static/datamodel'; import { Trial } from '../static/model/trial'; import Title1 from './overview/Title1'; @@ -16,7 +16,9 @@ interface OverviewProps { experimentUpdateBroadcast: number; trialsUpdateBroadcast: number; metricGraphMode: 'max' | 'min'; + bestTrialEntries: string; changeMetricGraphMode: (val: 'max' | 'min') => void; + changeEntries: (entries: string) => void; } interface OverviewState { @@ -38,6 +40,7 @@ class Overview extends React.Component { changeMetricGraphMode('max'); } + clickMinTop = (event: React.SyntheticEvent): void => { event.stopPropagation(); const { changeMetricGraphMode } = this.props; @@ -48,9 +51,16 @@ class Overview extends React.Component { this.setState({ trialConcurrency: val }); } + // updateEntries = (event: React.FormEvent, item: IDropdownOption | undefined): void => { + updateEntries = (event: React.FormEvent, item: any): void => { + if (item !== undefined) { + 
this.props.changeEntries(item.key); + } + } + render(): React.ReactNode { const { trialConcurrency } = this.state; - const { experimentUpdateBroadcast, metricGraphMode } = this.props; + const { experimentUpdateBroadcast, metricGraphMode, bestTrialEntries } = this.props; const searchSpace = this.convertSearchSpace(); const bestTrials = this.findBestTrials(); // eslint-disable-next-line @typescript-eslint/no-non-null-assertion @@ -58,23 +68,31 @@ class Overview extends React.Component { const accuracyGraphData = this.generateAccuracyGraph(bestTrials); const noDataMessage = bestTrials.length > 0 ? '' : 'No data'; - const titleMaxbgcolor = (metricGraphMode === 'max' ? '#999' : '#b3b3b3'); - const titleMinbgcolor = (metricGraphMode === 'min' ? '#999' : '#b3b3b3'); + const titleMaxbgcolor = (metricGraphMode === 'max' ? '#333' : '#b3b3b3'); + const titleMinbgcolor = (metricGraphMode === 'min' ? '#333' : '#b3b3b3'); const stackTokens: IStackTokens = { childrenGap: 30, }; + + const entriesOption = [ + { key: '10', text: 'Display top 10 trials' }, + { key: '20', text: 'Display top 20 trials' }, + { key: '30', text: 'Display top 30 trials' }, + { key: '50', text: 'Display top 50 trials' }, + { key: '100', text: 'Display top 100 trials' } + ]; return (
{/* status and experiment block */} - + - + {/* status block */} - + { /> {/* experiment parameters search space tuner assessor... */} - + - + {/* */} + {/* the scroll bar all the trial profile in the searchSpace div*/} @@ -104,19 +123,27 @@ class Overview extends React.Component { - - + +
- +
- + +
+
+
@@ -128,7 +155,7 @@ class Overview extends React.Component { />
- trial.info.id)} /> + trial.info.id)} />
@@ -155,10 +182,11 @@ class Overview extends React.Component { private findBestTrials(): Trial[] { const bestTrials = TRIALS.sort(); + const { bestTrialEntries } = this.props; if (this.props.metricGraphMode === 'max') { - bestTrials.reverse().splice(10); + bestTrials.reverse().splice(JSON.parse(bestTrialEntries)); } else { - bestTrials.splice(10); + bestTrials.splice(JSON.parse(bestTrialEntries)); } return bestTrials; } diff --git a/src/webui/src/components/TrialsDetail.tsx b/src/webui/src/components/TrialsDetail.tsx index e408210a1e..d04c5a3e1d 100644 --- a/src/webui/src/components/TrialsDetail.tsx +++ b/src/webui/src/components/TrialsDetail.tsx @@ -142,53 +142,54 @@ class TrialsDetail extends React.Component {/* trial table list */} - - {tableListIcon} - Trial jobs - - - - { if (this.tableList) { this.tableList.compareBtn(); } }} - /> - - - +
+ + {tableListIcon} + Trial jobs + + + { if (this.tableList) { this.tableList.addColumn(); } }} + text="Compare" + className="allList-compare" + // use child-component tableList's function, the function is in child-component. + onClick={(): void => { if (this.tableList) { this.tableList.compareBtn(); } }} /> - - (this.searchInput) = text} - /> - - - - - trial.tableRecord)} - columnList={columnList} - changeColumn={changeColumn} - trialsUpdateBroadcast={this.props.trialsUpdateBroadcast} - // TODO: change any to specific type - ref={(tabList): any => this.tableList = tabList} - /> + + + + { if (this.tableList) { this.tableList.addColumn(); } }} + /> + + (this.searchInput) = text} + /> + + + + trial.tableRecord)} + columnList={columnList} + changeColumn={changeColumn} + trialsUpdateBroadcast={this.props.trialsUpdateBroadcast} + // TODO: change any to specific type + ref={(tabList): any => this.tableList = tabList} + /> +
); } diff --git a/src/webui/src/components/overview/BasicInfo.tsx b/src/webui/src/components/overview/BasicInfo.tsx index ebd618b914..63977386a9 100644 --- a/src/webui/src/components/overview/BasicInfo.tsx +++ b/src/webui/src/components/overview/BasicInfo.tsx @@ -22,12 +22,16 @@ class BasicInfo extends React.Component {

Name

{EXPERIMENT.profile.params.experimentName}
+
+

ID

{EXPERIMENT.profile.id}

Start time

{formatTimestamp(EXPERIMENT.profile.startTime)}
+
+

End time

{formatTimestamp(EXPERIMENT.profile.endTime)}
@@ -45,6 +49,8 @@ class BasicInfo extends React.Component { {EXPERIMENT.profile.logDir || 'unknown'} + +

Training platform

{EXPERIMENT.profile.params.trainingServicePlatform}
diff --git a/src/webui/src/components/overview/Title1.tsx b/src/webui/src/components/overview/Title1.tsx index 3efab2a2b1..7cf309b2fe 100644 --- a/src/webui/src/components/overview/Title1.tsx +++ b/src/webui/src/components/overview/Title1.tsx @@ -4,7 +4,7 @@ import '../../static/style/overviewTitle.scss'; interface Title1Props { text: string; icon?: string; - bgcolor?: string; + fontColor?: string; } class Title1 extends React.Component { @@ -14,11 +14,11 @@ class Title1 extends React.Component { } render(): React.ReactNode { - const { text, icon, bgcolor } = this.props; + const { text, icon, fontColor } = this.props; return ( - + icon - {text} + {text} ); } diff --git a/src/webui/src/static/style/overview.scss b/src/webui/src/static/style/overview.scss index 6277bb1a66..f636424fdd 100644 --- a/src/webui/src/static/style/overview.scss +++ b/src/webui/src/static/style/overview.scss @@ -25,7 +25,6 @@ padding: 15px 0; color: #212121; width: 95%; - margin: 0 auto; } .nowrap{ diff --git a/src/webui/src/static/style/overviewTitle.scss b/src/webui/src/static/style/overviewTitle.scss index 84ae47bf38..ecb435225e 100644 --- a/src/webui/src/static/style/overviewTitle.scss +++ b/src/webui/src/static/style/overviewTitle.scss @@ -1,18 +1,12 @@ -$titleBgcolor: #b3b3b3; $iconPaddingVal: 14px; -.overview .overviewBoder{ - height: 100%; - border-right: 2px solid white; -} - .panelTitle{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; width: 100%; height: 38px; - padding: 0 $iconPaddingVal; - background: $titleBgcolor; - + padding: 4px $iconPaddingVal; + box-sizing: border-box; + img{ height: 22px; /* (38 - 22 ) / 2 */ @@ -40,7 +34,6 @@ $iconPaddingVal: 14px; } .top10bg{ - background-color: $titleBgcolor; .top10Title{ width: 160px; diff --git a/src/webui/src/static/style/trialsDetail.scss b/src/webui/src/static/style/trialsDetail.scss index d3ae47cb42..ab74fa017f 100644 --- a/src/webui/src/static/style/trialsDetail.scss +++ 
b/src/webui/src/static/style/trialsDetail.scss @@ -1,5 +1,5 @@ -$bg: #b3b3b3; #tabsty{ + background-color: #fff; .ms-Pivot{ .ms-Button{ padding: 0; @@ -7,13 +7,6 @@ $bg: #b3b3b3; margin-right: 0; border-right: 2px solid #fff; transition: 0.3s; - - &:hover{ - background-color: $bg; - } - .ms-Button-flexContainer{ - background-color: $bg; - } } .ms-Pivot-link::before{ @@ -45,10 +38,7 @@ $bg: #b3b3b3; /* graph, title total height */ width: 100%; height: 500px; - /* graph all title bg*/ - .ms-FocusZone{ - background-color: $bg; - } + .graph{ height: 432px; margin: 0 auto; From bf7daa8fabbc4a8ba24df7b0e9822f8035ab0429 Mon Sep 17 00:00:00 2001 From: Yuge Zhang Date: Mon, 11 May 2020 17:22:40 +0800 Subject: [PATCH 8/8] Prettify the export format of NAS trainer (#2389) --- docs/en_US/NAS/NasGuide.md | 17 ++++- src/sdk/pynni/nni/nas/pytorch/fixed.py | 51 +++++++++----- src/sdk/pynni/nni/nas/pytorch/mutator.py | 85 +++++++++++++++++++++--- src/sdk/pynni/nni/nas/pytorch/utils.py | 11 +++ 4 files changed, 135 insertions(+), 29 deletions(-) diff --git a/docs/en_US/NAS/NasGuide.md b/docs/en_US/NAS/NasGuide.md index 6773d28e14..35a2fda442 100644 --- a/docs/en_US/NAS/NasGuide.md +++ b/docs/en_US/NAS/NasGuide.md @@ -156,12 +156,23 @@ model = Net() apply_fixed_architecture(model, "model_dir/final_architecture.json") ``` -The JSON is simply a mapping from mutable keys to one-hot or multi-hot representation of choices. For example +The JSON is simply a mapping from mutable keys to choices. Choices can be expressed in: + +* A string: select the candidate with corresponding name. +* A number: select the candidate with corresponding index. +* A list of string: select the candidates with corresponding names. +* A list of number: select the candidates with corresponding indices. +* A list of boolean values: a multi-hot array. 
+ +For example, ```json { - "LayerChoice1": [false, true, false, false], - "InputChoice2": [true, true, false] + "LayerChoice1": "conv5x5", + "LayerChoice2": 6, + "InputChoice3": ["layer1", "layer3"], + "InputChoice4": [1, 2], + "InputChoice5": [false, true, false, false, true] } ``` diff --git a/src/sdk/pynni/nni/nas/pytorch/fixed.py b/src/sdk/pynni/nni/nas/pytorch/fixed.py index 0be4e0ea79..106368128c 100644 --- a/src/sdk/pynni/nni/nas/pytorch/fixed.py +++ b/src/sdk/pynni/nni/nas/pytorch/fixed.py @@ -3,10 +3,9 @@ import json -import torch - -from nni.nas.pytorch.mutables import MutableScope -from nni.nas.pytorch.mutator import Mutator +from .mutables import InputChoice, LayerChoice, MutableScope +from .mutator import Mutator +from .utils import to_list class FixedArchitecture(Mutator): @@ -17,8 +16,8 @@ class FixedArchitecture(Mutator): ---------- model : nn.Module A mutable network. - fixed_arc : str or dict - Path to the architecture checkpoint (a string), or preloaded architecture object (a dict). + fixed_arc : dict + Preloaded architecture object. strict : bool Force everything that appears in ``fixed_arc`` to be used at least once. """ @@ -33,6 +32,34 @@ def __init__(self, model, fixed_arc, strict=True): raise RuntimeError("Unexpected keys found in fixed architecture: {}.".format(fixed_arc_keys - mutable_keys)) if mutable_keys - fixed_arc_keys: raise RuntimeError("Missing keys in fixed architecture: {}.".format(mutable_keys - fixed_arc_keys)) + self._fixed_arc = self._from_human_readable_architecture(self._fixed_arc) + + def _from_human_readable_architecture(self, human_arc): + # convert from an exported architecture + result_arc = {k: to_list(v) for k, v in human_arc.items()} # there could be tensors, numpy arrays, etc. 
+        # First, convert non-list to list, because there could be {"op1": 0} or {"op1": "conv"},
+        # which means {"op1": [0, ]} or {"op1": ["conv", ]}
+        result_arc = {k: v if isinstance(v, list) else [v] for k, v in result_arc.items()}
+        # Second, infer which ones are multi-hot arrays and which ones are in human-readable format.
+        # This is non-trivial, since if an array is [0, 1], we cannot know for sure it means [false, true] or [true, true].
+        # Here, we assume a multihot array has to be a boolean array or a float array and matches the length.
+        for mutable in self.mutables:
+            if mutable.key not in result_arc:
+                continue  # skip silently
+            choice_arr = result_arc[mutable.key]
+            if all(isinstance(v, bool) for v in choice_arr) or all(isinstance(v, float) for v in choice_arr):
+                if (isinstance(mutable, LayerChoice) and len(mutable) == len(choice_arr)) or \
+                        (isinstance(mutable, InputChoice) and mutable.n_candidates == len(choice_arr)):
+                    # multihot, do nothing
+                    continue
+            if isinstance(mutable, LayerChoice):
+                choice_arr = [mutable.names.index(val) if isinstance(val, str) else val for val in choice_arr]
+                choice_arr = [i in choice_arr for i in range(len(mutable))]
+            elif isinstance(mutable, InputChoice):
+                choice_arr = [mutable.choose_from.index(val) if isinstance(val, str) else val for val in choice_arr]
+                choice_arr = [i in choice_arr for i in range(mutable.n_candidates)]
+            result_arc[mutable.key] = choice_arr
+        return result_arc
 
     def sample_search(self):
         """
@@ -47,17 +74,6 @@ def sample_final(self):
         return self._fixed_arc
 
 
-def _encode_tensor(data):
-    if isinstance(data, list):
-        if all(map(lambda o: isinstance(o, bool), data)):
-            return torch.tensor(data, dtype=torch.bool)  # pylint: disable=not-callable
-        else:
-            return torch.tensor(data, dtype=torch.float)  # pylint: disable=not-callable
-    if isinstance(data, dict):
-        return {k: _encode_tensor(v) for k, v in data.items()}
-    return data
-
-
 def apply_fixed_architecture(model, fixed_arc):
     """
     Load architecture from 
`fixed_arc` and apply to model. @@ -78,7 +94,6 @@ def apply_fixed_architecture(model, fixed_arc): if isinstance(fixed_arc, str): with open(fixed_arc) as f: fixed_arc = json.load(f) - fixed_arc = _encode_tensor(fixed_arc) architecture = FixedArchitecture(model, fixed_arc) architecture.reset() return architecture diff --git a/src/sdk/pynni/nni/nas/pytorch/mutator.py b/src/sdk/pynni/nni/nas/pytorch/mutator.py index 160a20de84..e9cc68857a 100644 --- a/src/sdk/pynni/nni/nas/pytorch/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/mutator.py @@ -7,7 +7,9 @@ import numpy as np import torch -from nni.nas.pytorch.base_mutator import BaseMutator +from .base_mutator import BaseMutator +from .mutables import LayerChoice, InputChoice +from .utils import to_list logger = logging.getLogger(__name__) @@ -58,7 +60,16 @@ def export(self): dict A mapping from key of mutables to decisions. """ - return self.sample_final() + sampled = self.sample_final() + result = dict() + for mutable in self.mutables: + if not isinstance(mutable, (LayerChoice, InputChoice)): + # not supported as built-in + continue + result[mutable.key] = self._convert_mutable_decision_to_human_readable(mutable, sampled.pop(mutable.key)) + if sampled: + raise ValueError("Unexpected keys returned from 'sample_final()': %s", list(sampled.keys())) + return result def status(self): """ @@ -159,7 +170,7 @@ def _map_fn(op, args, kwargs): mask = self._get_decision(mutable) assert len(mask) == len(mutable), \ "Invalid mask, expected {} to be of length {}.".format(mask, len(mutable)) - out = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable], mask) + out, mask = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable], mask) return self._tensor_reduction(mutable.reduction, out), mask def on_forward_input_choice(self, mutable, tensor_list): @@ -185,17 +196,41 @@ def on_forward_input_choice(self, mutable, tensor_list): mask = self._get_decision(mutable) assert len(mask) == 
mutable.n_candidates, \ "Invalid mask, expected {} to be of length {}.".format(mask, mutable.n_candidates) - out = self._select_with_mask(lambda x: x, [(t,) for t in tensor_list], mask) + out, mask = self._select_with_mask(lambda x: x, [(t,) for t in tensor_list], mask) return self._tensor_reduction(mutable.reduction, out), mask def _select_with_mask(self, map_fn, candidates, mask): - if "BoolTensor" in mask.type(): + """ + Select masked tensors and return a list of tensors. + + Parameters + ---------- + map_fn : function + Convert candidates to target candidates. Can be simply identity. + candidates : list of torch.Tensor + Tensor list to apply the decision on. + mask : list-like object + Can be a list, an numpy array or a tensor (recommended). Needs to + have the same length as ``candidates``. + + Returns + ------- + tuple of list of torch.Tensor and torch.Tensor + Output and mask. + """ + if (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], bool)) or \ + (isinstance(mask, np.ndarray) and mask.dtype == np.bool) or \ + "BoolTensor" in mask.type(): out = [map_fn(*cand) for cand, m in zip(candidates, mask) if m] - elif "FloatTensor" in mask.type(): + elif (isinstance(mask, list) and len(mask) >= 1 and isinstance(mask[0], (float, int))) or \ + (isinstance(mask, np.ndarray) and mask.dtype in (np.float32, np.float64, np.int32, np.int64)) or \ + "FloatTensor" in mask.type(): out = [map_fn(*cand) * m for cand, m in zip(candidates, mask) if m] else: - raise ValueError("Unrecognized mask") - return out + raise ValueError("Unrecognized mask '%s'" % mask) + if not torch.is_tensor(mask): + mask = torch.tensor(mask) # pylint: disable=not-callable + return out, mask def _tensor_reduction(self, reduction_type, tensor_list): if reduction_type == "none": @@ -237,3 +272,37 @@ def _get_decision(self, mutable): result = self._cache[mutable.key] logger.debug("Decision %s: %s", mutable.key, result) return result + + def _convert_mutable_decision_to_human_readable(self, 
mutable, sampled): + # Assert the existence of mutable.key in returned architecture. + # Also check if there is anything extra. + multihot_list = to_list(sampled) + converted = None + # If it's a boolean array, we can do optimization. + if all([t == 0 or t == 1 for t in multihot_list]): + if isinstance(mutable, LayerChoice): + assert len(multihot_list) == len(mutable), \ + "Results returned from 'sample_final()' (%s: %s) either too short or too long." \ + % (mutable.key, multihot_list) + # check if all modules have different names and they indeed have names + if len(set(mutable.names)) == len(mutable) and not all(d.isdigit() for d in mutable.names): + converted = [name for i, name in enumerate(mutable.names) if multihot_list[i]] + else: + converted = [i for i in range(len(multihot_list)) if multihot_list[i]] + if isinstance(mutable, InputChoice): + assert len(multihot_list) == mutable.n_candidates, \ + "Results returned from 'sample_final()' (%s: %s) either too short or too long." \ + % (mutable.key, multihot_list) + # check if all input candidates have different names + if len(set(mutable.choose_from)) == mutable.n_candidates: + converted = [name for i, name in enumerate(mutable.choose_from) if multihot_list[i]] + else: + converted = [i for i in range(len(multihot_list)) if multihot_list[i]] + if converted is not None: + # if only one element, then remove the bracket + if len(converted) == 1: + converted = converted[0] + else: + # do nothing + converted = multihot_list + return converted diff --git a/src/sdk/pynni/nni/nas/pytorch/utils.py b/src/sdk/pynni/nni/nas/pytorch/utils.py index 7536740eb3..a3f5aabfb7 100644 --- a/src/sdk/pynni/nni/nas/pytorch/utils.py +++ b/src/sdk/pynni/nni/nas/pytorch/utils.py @@ -4,6 +4,7 @@ import logging from collections import OrderedDict +import numpy as np import torch _counter = 0 @@ -45,6 +46,16 @@ def to_device(obj, device): raise ValueError("'%s' has unsupported type '%s'" % (obj, type(obj))) +def to_list(arr): + if 
torch.is_tensor(arr): + return arr.cpu().numpy().tolist() + if isinstance(arr, np.ndarray): + return arr.tolist() + if isinstance(arr, (list, tuple)): + return list(arr) + return arr + + class AverageMeterGroup: """ Average meter group for multiple average meters.