diff --git a/Makefile b/Makefile index bba531ebda..829c68bd3f 100644 --- a/Makefile +++ b/Makefile @@ -173,12 +173,12 @@ install-python-modules: dev-install-python-modules: #$(_INFO) Installing Python SDK $(_END) mkdir -p build - ln -sf ../src/sdk/pynni/nni build - ln -sf ../src/sdk/pycli/nnicli build - ln -sf ../tools/nni_annotation build - ln -sf ../tools/nni_cmd build - ln -sf ../tools/nni_trial_tool build - ln -sf ../tools/nni_gpu_tool build + ln -sfT ../src/sdk/pynni/nni build/nni + ln -sfT ../src/sdk/pycli/nnicli build/nnicli + ln -sfT ../tools/nni_annotation build/nni_annotation + ln -sfT ../tools/nni_cmd build/nni_cmd + ln -sfT ../tools/nni_trial_tool build/nni_trial_tool + ln -sfT ../tools/nni_gpu_tool build/nni_gpu_tool cp setup.py build/ cp README.md build/ sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' build/setup.py @@ -205,16 +205,14 @@ install-node-modules: .PHONY: dev-install-node-modules dev-install-node-modules: #$(_INFO) Installing NNI Package $(_END) - rm -rf $(NNI_PKG_FOLDER) - ln -sf ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER) + ln -sfT ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER) cp src/nni_manager/package.json $(NNI_PKG_FOLDER) sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json - ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER) - ln -sf ${PWD}/src/webui/build -t $(NNI_PKG_FOLDER) - mv $(NNI_PKG_FOLDER)/build $(NNI_PKG_FOLDER)/static + ln -sfT ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules + ln -sfT ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static mkdir -p $(NASUI_PKG_FOLDER) - ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER) - ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER) + ln -sfT ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build + ln -sfT ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js .PHONY: install-scripts install-scripts: diff --git a/README.md b/README.md index 3edc61e374..d87ce06e1f 100644 --- a/README.md +++ b/README.md @@ -342,7 +342,7 @@ With authors' permission, we listed a set of NNI usage examples and relevant art Join IM discussion groups: |Gitter||WeChat| |----|----|----| -|![image](https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png)| OR |![image](https://user-images.githubusercontent.com/39592018/80665762-f06f2a00-8acc-11ea-8d22-e461e68e2d9b.png)| +|![image](https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png)| OR |![image](https://github.com/JSong-Jia/NNI-user-group/blob/master/NNI%20user%20group_3.png)| ## Related Projects diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index 6f5068d320..c608cc970a 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -92,8 +92,18 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod * Required key. Set the mount path in your container used in PAI. * paiStoragePlugin * Optional key. Set the storage plugin name used in PAI. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field. +* command + * Optional key. Set the commands used in PAI container. * paiConfigPath * Optional key. Set the file path of pai job configuration, the file is in yaml format. + If users set `paiConfigPath` in NNI's configuration file, no need to specify the fields `command`, `paiStoragePlugin`, `virtualCluster`, `image`, `memoryMB`, `cpuNum`, `gpuNum` in `trial` configuration. These fields will use the values from the config file specified by `paiConfigPath`. + ``` + Note: + 1. The job name in PAI's configuration file will be replaced by a new job name, the new job name is created by NNI, the name format is nni_exp_${this.experimentId}_trial_${trialJobId}. + + 2. If users set multiple taskRoles in PAI's configuration file, NNI will wrap all of these taksRoles and start multiple tasks in one trial job, users should ensure that only one taskRole report metric to NNI, otherwise there might be some conflict error. + + ``` Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command diff --git a/src/nni_manager/.eslintrc b/src/nni_manager/.eslintrc index 12856c87f9..350ff7e0ad 100644 --- a/src/nni_manager/.eslintrc +++ b/src/nni_manager/.eslintrc @@ -23,6 +23,13 @@ "@typescript-eslint/consistent-type-assertions": 0, "@typescript-eslint/no-inferrable-types": 0, "no-inner-declarations": 0, + "@typescript-eslint/explicit-function-return-type": "error", + "@typescript-eslint/no-unused-vars": [ + "error", + { + "argsIgnorePattern": "^_" + } + ], "@typescript-eslint/no-var-requires": 0 }, "ignorePatterns": [ diff --git a/src/nni_manager/core/sqlDatabase.ts b/src/nni_manager/core/sqlDatabase.ts index 125a1aff6d..485ba75a33 100644 --- a/src/nni_manager/core/sqlDatabase.ts +++ b/src/nni_manager/core/sqlDatabase.ts @@ -98,7 +98,7 @@ class SqlDB implements Database { this.resolve(this.initTask, err); } else { if (createNew) { - this.db.exec(createTables, (error: Error | null) => { + this.db.exec(createTables, (_error: Error | null) => { this.resolve(this.initTask, err); }); } else { diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index 3e89f675b2..34aa0b0121 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -14,7 +14,6 @@ "azure-storage": "^2.10.2", "chai-as-promised": "^7.1.1", "child-process-promise": "^2.2.1", - "deepmerge": "^4.2.2", "express": "^4.16.3", "express-joi-validator": "^2.0.0", "js-base64": "^2.4.9", diff --git a/src/nni_manager/rest_server/restHandler.ts b/src/nni_manager/rest_server/restHandler.ts index 2d8494cb36..457f154b69 100644 --- a/src/nni_manager/rest_server/restHandler.ts +++ b/src/nni_manager/rest_server/restHandler.ts @@ -60,7 +60,7 @@ class NNIRestHandler { this.exportData(router); // Express-joi-validator configuration - router.use((err: any, req: Request, res: Response, next: any) => { + router.use((err: any, _req: Request, res: Response, _next: any) => { if (err.isBoom) { this.log.error(err.output.payload); diff --git a/src/nni_manager/training_service/dlts/dltsTrainingService.ts b/src/nni_manager/training_service/dlts/dltsTrainingService.ts index e2e5868c46..ba707fbb13 100644 --- a/src/nni_manager/training_service/dlts/dltsTrainingService.ts +++ b/src/nni_manager/training_service/dlts/dltsTrainingService.ts @@ -131,7 +131,7 @@ class DLTSTrainingService implements TrainingService { private async statusCheckingLoop(): Promise { while (!this.stopping) { const updateDLTSTrialJobs: Promise[] = []; - for (const [trialJobId, dltsTrialJob] of this.trialJobsMap) { + for (const dltsTrialJob of this.trialJobsMap.values()) { updateDLTSTrialJobs.push(this.getDLTSTrialJobInfo(dltsTrialJob)); } @@ -405,7 +405,7 @@ class DLTSTrainingService implements TrainingService { } } - public async getClusterMetadata(key: string): Promise { + public async getClusterMetadata(_key: string): Promise { return ''; } @@ -545,7 +545,7 @@ class DLTSTrainingService implements TrainingService { body: parameterFileMeta }; await new Promise((resolve, reject) => { - request(req, (err: Error, res: request.Response) => { + request(req, (err: Error, _res: request.Response) => { if (err) { reject(err); } else { diff --git a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts index d0858189f5..fb393d186c 100644 --- a/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts +++ b/src/nni_manager/training_service/kubernetes/azureStorageClientUtils.ts @@ -20,7 +20,7 @@ export namespace AzureStorageClientUtility { */ export async function createShare(fileServerClient: any, azureShare: any): Promise { const deferred: Deferred = new Deferred(); - fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => { + fileServerClient.createShareIfNotExists(azureShare, (error: any, _result: any, _response: any) => { if (error) { getLogger() .error(`Create share failed:, ${error}`); @@ -41,7 +41,7 @@ export namespace AzureStorageClientUtility { */ export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise { const deferred: Deferred = new Deferred(); - fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => { + fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, _result: any, _response: any) => { if (error) { getLogger() .error(`Create directory failed:, ${error}`); @@ -89,7 +89,7 @@ export namespace AzureStorageClientUtility { localFilePath: string): Promise { const deferred: Deferred = new Deferred(); await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, - (error: any, result: any, response: any) => { + (error: any, _result: any, _response: any) => { if (error) { getLogger() .error(`Upload file failed:, ${error}`); @@ -114,7 +114,7 @@ export namespace AzureStorageClientUtility { localFilePath: string): Promise { const deferred: Deferred = new Deferred(); await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), - (error: any, result: any, response: any) => { + (error: any, _result: any, _response: any) => { if (error) { getLogger() .error(`Download file failed:, ${error}`); @@ -183,7 +183,7 @@ export namespace AzureStorageClientUtility { const deferred: Deferred = new Deferred(); await mkDirP(localDirectory); fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null', - async (error: any, result: any, response: any) => { + async (_error: any, result: any, _response: any) => { if (('entries' in result) === false) { getLogger() .error(`list files failed, can't get entries in result`); diff --git a/src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts b/src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts index 7a54b93f27..129bd7ba5b 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesJobInfoCollector.ts @@ -40,8 +40,8 @@ export class KubernetesJobInfoCollector { await Promise.all(updateKubernetesTrialJobs); } - protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, - kubernetesTrialJob: KubernetesTrialJobDetail): Promise { + protected async retrieveSingleTrialJobInfo(_kubernetesCRDClient: KubernetesCRDClient | undefined, + _kubernetesTrialJob: KubernetesTrialJobDetail): Promise { throw new MethodNotImplementedError(); } } diff --git a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts index e13bd75d51..56870fac97 100644 --- a/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts +++ b/src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts @@ -77,7 +77,7 @@ abstract class KubernetesTrainingService { public async listTrialJobs(): Promise { const jobs: TrialJobDetail[] = []; - for (const [key, value] of this.trialJobsMap) { + for (const key of this.trialJobsMap.keys()) { jobs.push(await this.getTrialJob(key)); } @@ -107,7 +107,7 @@ abstract class KubernetesTrainingService { return false; } - public getClusterMetadata(key: string): Promise { + public getClusterMetadata(_key: string): Promise { return Promise.resolve(''); } diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index d6c405b0aa..61742f378a 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -3,7 +3,6 @@ 'use strict'; -import {TrialConfig} from '../common/trialConfig'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; export class PAIClusterConfig { diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index a419be5c95..eb15765a4f 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -62,7 +62,7 @@ export class PAIJobInfoCollector { }; //TODO : pass in request timeout param? - request(getJobInfoRequest, (error: Error, response: request.Response, body: any) => { + request(getJobInfoRequest, (error: Error, response: request.Response, _body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 500) { this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); // Queried PAI job info failed, set job status to UNKNOWN diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts index 38087f574c..00e960cdce 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -4,8 +4,6 @@ 'use strict'; import { Request, Response, Router } from 'express'; -import { Inject } from 'typescript-ioc'; -import * as component from '../../common/component'; import { ClusterJobRestServer } from '../common/clusterJobRestServer'; import { PAITrainingService } from './paiTrainingService'; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 174083405d..48737ead35 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -19,7 +19,6 @@ 'use strict'; -import * as cpp from 'child-process-promise'; import * as fs from 'fs'; import * as path from 'path'; // tslint:disable-next-line:no-implicit-dependencies @@ -29,11 +28,13 @@ import * as component from '../../../common/component'; import { Deferred } from 'ts-deferred'; import { String } from 'typescript-string-operations'; import { - HyperParameters, NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + HyperParameters, NNIManagerIpConfig, + TrialJobApplicationForm, TrialJobDetail } from '../../../common/trainingService'; -import { delay, generateParamFileName, - getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../../common/utils'; +import { + generateParamFileName, + getIPV4Address, getVersion, uniqueString +} from '../../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { execMkdir, validateCodeDir, execCopydir } from '../../common/util'; @@ -44,7 +45,6 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; import { PAIJobRestServer } from '../paiJobRestServer'; const yaml = require('js-yaml'); -const deepmerge = require('deepmerge'); /** * Training Service implementation for OpenPAI (Open Platform for AI) @@ -53,9 +53,11 @@ const deepmerge = require('deepmerge'); @component.Singleton class PAIK8STrainingService extends PAITrainingService { protected paiTrialConfig: NNIPAIK8STrialConfig | undefined; - + private paiJobConfig: undefined; + private nniVersion: string | undefined; constructor() { super(); + } public async setClusterMetadata(key: string, value: string): Promise { @@ -68,10 +70,10 @@ class PAIK8STrainingService extends PAITrainingService { this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService)); this.paiClusterConfig = JSON.parse(value); this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); - if(this.paiClusterConfig.passWord) { + if (this.paiClusterConfig.passWord) { // Get PAI authentication token await this.updatePaiToken(); - } else if(this.paiClusterConfig.token) { + } else if (this.paiClusterConfig.token) { this.paiToken = this.paiClusterConfig.token; } break; @@ -84,9 +86,13 @@ class PAIK8STrainingService extends PAITrainingService { this.paiTrialConfig = JSON.parse(value); // Validate to make sure codeDir doesn't have too many files await validateCodeDir(this.paiTrialConfig.codeDir); + if (this.paiTrialConfig.paiConfigPath) { + this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); + } break; case TrialConfigMetadataKey.VERSION_CHECK: this.versionCheck = (value === 'true' || value === 'True'); + this.nniVersion = this.versionCheck ? await getVersion() : ''; break; case TrialConfigMetadataKey.LOG_COLLECTION: this.logCollection = value; @@ -99,7 +105,7 @@ class PAIK8STrainingService extends PAITrainingService { this.log.error(`Uknown key: ${key}`); } } - + // update trial parameters for multi-phase public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); @@ -142,71 +148,99 @@ class PAIK8STrainingService extends PAITrainingService { return trialJobDetail; } - public generateJobConfigInYamlFormat(trialJobId: string, command: string) { + private generateNNITrialCommand(trialJobDetail: PAITrialJobDetail, command: string): string { if (this.paiTrialConfig === undefined) { throw new Error('trial config is not initialized'); } - const jobName = `nni_exp_${this.experimentId}_trial_${trialJobId}` - const paiJobConfig: any = { - protocolVersion: 2, - name: jobName, - type: 'job', - jobRetryCount: 0, - prerequisites: [ - { - type: 'dockerimage', - uri: this.paiTrialConfig.image, - name: 'docker_image_0' - } - ], - taskRoles: { - taskrole: { - instances: 1, - completion: { - minFailedInstances: 1, - minSucceededInstances: -1 - }, - taskRetryCount: 0, - dockerImage: 'docker_image_0', - resourcePerInstance: { - gpu: this.paiTrialConfig.gpuNum, - cpu: this.paiTrialConfig.cpuNum, - memoryMB: this.paiTrialConfig.memoryMB - }, - commands: [ - command - ] - } - }, - extras: { - 'com.microsoft.pai.runtimeplugin': [ - { - plugin: this.paiTrialConfig.paiStoragePlugin - } - ], - submitFrom: 'submit-job-v2' - } - } - if (this.paiTrialConfig.virtualCluster) { - paiJobConfig.defaults= { - virtualCluster: this.paiTrialConfig.virtualCluster - } + const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobDetail.id}`; + const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); + const nniPaiTrialCommand: string = String.Format( + PAI_K8S_TRIAL_COMMAND_FORMAT, + `${containerWorkingDir}`, + `${containerWorkingDir}/nnioutput`, + trialJobDetail.id, + this.experimentId, + trialJobDetail.form.sequenceId, + this.isMultiPhase, + command, + nniManagerIp, + this.paiRestServerPort, + this.nniVersion, + this.logCollection + ) + .replace(/\r\n|\n|\r/gm, ''); + + return nniPaiTrialCommand; + + } + + private generateJobConfigInYamlFormat(trialJobDetail: PAITrialJobDetail): any { + if (this.paiTrialConfig === undefined) { + throw new Error('trial config is not initialized'); } + const jobName = `nni_exp_${this.experimentId}_trial_${trialJobDetail.id}` + let nniJobConfig: any = undefined; if (this.paiTrialConfig.paiConfigPath) { - try { - const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); - //deepmerge(x, y), if an element at the same key is present for both x and y, the value from y will appear in the result. - //refer: https://github.com/TehShrike/deepmerge - const overwriteMerge = (destinationArray: any, sourceArray: any, options: any) => sourceArray; - return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig, { arrayMerge: overwriteMerge })); - } catch (error) { - this.log.error(`Error occurs during loading and merge ${this.paiTrialConfig.paiConfigPath} : ${error}`); + nniJobConfig = this.paiJobConfig; + nniJobConfig.name = jobName; + // Each taskRole will generate new command in NNI's command format + // Each command will be formatted to NNI style + for (const taskRoleIndex in nniJobConfig.taskRoles) { + const commands = nniJobConfig.taskRoles[taskRoleIndex].commands + const nniTrialCommand = this.generateNNITrialCommand(trialJobDetail, commands.join(" && ").replace(/(["'$`\\])/g, '\\$1')); + nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand] } + } else { - return yaml.safeDump(paiJobConfig); + nniJobConfig = { + protocolVersion: 2, + name: jobName, + type: 'job', + jobRetryCount: 0, + prerequisites: [ + { + type: 'dockerimage', + uri: this.paiTrialConfig.image, + name: 'docker_image_0' + } + ], + taskRoles: { + taskrole: { + instances: 1, + completion: { + minFailedInstances: 1, + minSucceededInstances: -1 + }, + taskRetryCount: 0, + dockerImage: 'docker_image_0', + resourcePerInstance: { + gpu: this.paiTrialConfig.gpuNum, + cpu: this.paiTrialConfig.cpuNum, + memoryMB: this.paiTrialConfig.memoryMB + }, + commands: [ + this.generateNNITrialCommand(trialJobDetail, this.paiTrialConfig.command) + ] + } + }, + extras: { + 'com.microsoft.pai.runtimeplugin': [ + { + plugin: this.paiTrialConfig.paiStoragePlugin + } + ], + submitFrom: 'submit-job-v2' + } + } + if (this.paiTrialConfig.virtualCluster) { + nniJobConfig.defaults = { + virtualCluster: this.paiTrialConfig.virtualCluster + } + } } - } + return yaml.safeDump(nniJobConfig); + } protected async submitTrialJobToPAI(trialJobId: string): Promise { const deferred: Deferred = new Deferred(); @@ -247,29 +281,8 @@ class PAIK8STrainingService extends PAITrainingService { //Copy codeDir files to local working folder await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath); - - const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); - const version: string = this.versionCheck ? await getVersion() : ''; - const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobId}`; - const nniPaiTrialCommand: string = String.Format( - PAI_K8S_TRIAL_COMMAND_FORMAT, - `${containerWorkingDir}`, - `${containerWorkingDir}/nnioutput`, - trialJobId, - this.experimentId, - trialJobDetail.form.sequenceId, - this.isMultiPhase, - this.paiTrialConfig.command, - nniManagerIp, - this.paiRestServerPort, - version, - this.logCollection - ) - .replace(/\r\n|\n|\r/gm, ''); - - this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); - - const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand); + //Generate Job Configuration in yaml format + const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail); this.log.debug(paiJobConfig); // Step 3. Submit PAI job via Rest call // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 4f44dad8c6..e26c16ecee 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -3,27 +3,21 @@ 'use strict'; -import * as fs from 'fs'; import * as path from 'path'; import * as request from 'request'; import * as component from '../../common/component'; import { EventEmitter } from 'events'; import { Deferred } from 'ts-deferred'; -import { String } from 'typescript-string-operations'; import { getExperimentId } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { - HyperParameters, NNIManagerIpConfig, TrainingService, + NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; -import { delay, generateParamFileName, - getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../common/utils'; -import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; -import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { execMkdir, validateCodeDir } from '../common/util'; +import { delay } from '../../common/utils'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; -import { PAIJobRestServer, ParameterFileMeta } from './paiJobRestServer'; +import { PAIJobRestServer } from './paiJobRestServer'; import { PAIClusterConfig, PAITrialJobDetail } from './paiConfig'; /** @@ -39,7 +33,7 @@ abstract class PAITrainingService implements TrainingService { protected paiClusterConfig?: PAIClusterConfig; protected readonly jobQueue: string[]; protected stopping: boolean = false; - protected paiToken? : string; + protected paiToken?: string; protected paiTokenUpdateTime?: number; protected readonly paiTokenUpdateInterval: number; protected readonly experimentId!: string; @@ -81,15 +75,15 @@ abstract class PAITrainingService implements TrainingService { this.log.info('PAI training service exit.'); } - public async submitTrialJob(form: TrialJobApplicationForm): Promise { + public async submitTrialJob(_form: TrialJobApplicationForm): Promise { throw new Error('Not implemented!'); } - public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { + public async updateTrialJob(_trialJobId: string, _form: TrialJobApplicationForm): Promise { throw new Error('Not implemented!'); } - protected async submitTrialJobToPAI(trialJobId: string): Promise { + protected async submitTrialJobToPAI(_trialJobId: string): Promise { throw new Error('Not implemented!'); } @@ -109,14 +103,14 @@ abstract class PAITrainingService implements TrainingService { } } - public async setClusterMetadata(key: string, value: string): Promise { + public async setClusterMetadata(_key: string, _value: string): Promise { throw new Error('Not implemented!'); } public async listTrialJobs(): Promise { const jobs: TrialJobDetail[] = []; - for (const [key, value] of this.trialJobsMap) { + for (const key of this.trialJobsMap.keys()) { jobs.push(await this.getTrialJob(key)); } @@ -150,7 +144,7 @@ abstract class PAITrainingService implements TrainingService { } public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise { - const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); if (trialJobDetail === undefined) { return Promise.reject(new Error(`cancelTrialJob: trial job id ${trialJobId} not found`)); } @@ -169,10 +163,10 @@ abstract class PAITrainingService implements TrainingService { const stopJobRequest: request.Options = { uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\ -/jobs/${trialJobDetail.paiJobName}/executionType`, +/jobs/${trialJobDetail.paiJobName}/executionType`, method: 'PUT', json: true, - body: {value: 'STOP'}, + body: { value: 'STOP' }, headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${this.paiToken}` @@ -183,11 +177,11 @@ abstract class PAITrainingService implements TrainingService { trialJobDetail.isEarlyStopped = isEarlyStopped; const deferred: Deferred = new Deferred(); - request(stopJobRequest, (error: Error, response: request.Response, body: any) => { + request(stopJobRequest, (error: Error, response: request.Response, _body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 400) { this.log.error(`PAI Training service: stop trial ${trialJobId} to PAI Cluster failed!`); deferred.reject((error !== undefined && error !== null) ? error.message : - `Stop trial failed, http code: ${response.statusCode}`); + `Stop trial failed, http code: ${response.statusCode}`); } else { deferred.resolve(); } @@ -196,7 +190,7 @@ abstract class PAITrainingService implements TrainingService { return deferred.promise; } - public getClusterMetadata(key: string): Promise { + public getClusterMetadata(_key: string): Promise { throw new Error('Not implemented!'); } @@ -236,7 +230,7 @@ abstract class PAITrainingService implements TrainingService { protected async statusCheckingLoop(): Promise { while (!this.stopping) { - if(this.paiClusterConfig && this.paiClusterConfig.passWord) { + if (this.paiClusterConfig && this.paiClusterConfig.passWord) { try { await this.updatePaiToken(); } catch (error) { @@ -302,7 +296,7 @@ abstract class PAITrainingService implements TrainingService { }); let timeoutId: NodeJS.Timer; - const timeoutDelay: Promise = new Promise((resolve: Function, reject: Function): void => { + const timeoutDelay: Promise = new Promise((_resolve: Function, reject: Function): void => { // Set timeout and reject the promise once reach timeout (5 seconds) timeoutId = setTimeout( () => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')), diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnData.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnData.ts index 3ba015298f..9a2b6b3706 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnData.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnData.ts @@ -3,8 +3,6 @@ 'use strict'; -import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; - export const PAI_INSTALL_NNI_SHELL_FORMAT: string = `#!/bin/bash if python3 -c 'import nni' > /dev/null 2>&1; then diff --git a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts index 08038e5b59..80699d0b0e 100644 --- a/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiYarn/paiYarnTrainingService.ts @@ -8,24 +8,22 @@ import * as path from 'path'; import * as request from 'request'; import * as component from '../../../common/component'; -import { EventEmitter } from 'events'; import { Deferred } from 'ts-deferred'; import { String } from 'typescript-string-operations'; -import { getExperimentId } from '../../../common/experimentStartupInfo'; -import { getLogger, Logger } from '../../../common/log'; import { - HyperParameters, NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + HyperParameters, NNIManagerIpConfig, + TrialJobApplicationForm, TrialJobDetail } from '../../../common/trainingService'; -import { delay, generateParamFileName, - getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../../common/utils'; +import { + generateParamFileName, + getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin +} from '../../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { execMkdir, validateCodeDir } from '../../common/util'; import { HDFSClientUtility } from './hdfsClientUtility'; import { NNIPAITrialConfig, PAIJobConfig, PAITaskRole } from './paiYarnConfig'; import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT } from './paiYarnData'; -import { PAIJobInfoCollector } from '../paiJobInfoCollector'; import { PAITrainingService } from '../paiTrainingService'; import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; @@ -65,7 +63,7 @@ class PAIYarnTrainingService extends PAITrainingService { PAI_LOG_PATH_FORMAT, this.paiClusterConfig.host, hdfsOutputDir - ); + ); const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, @@ -99,13 +97,13 @@ class PAIYarnTrainingService extends PAITrainingService { port: 80, path: '/webhdfs/api/v1', host: this.paiClusterConfig.host - + }); this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); - if(this.paiClusterConfig.passWord) { + if (this.paiClusterConfig.passWord) { // Get PAI authentication token await this.updatePaiToken(); - } else if(this.paiClusterConfig.token) { + } else if (this.paiClusterConfig.token) { this.paiToken = this.paiClusterConfig.token; } else { throw new Error('pai cluster config format error, please set password or token!'); @@ -121,14 +119,14 @@ class PAIYarnTrainingService extends PAITrainingService { // Validate to make sure codeDir doesn't have too many files await validateCodeDir(this.paiTrialConfig.codeDir); - + // Copy experiment files from local folder to HDFS this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs( this.paiTrialConfig.codeDir, HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), this.hdfsClient ); - + // Upload authFile to hdfs if (this.paiTrialConfig.authFile) { this.authFileHdfsPath = unixPathJoin(HDFSClientUtility.hdfsExpRootDir(this.paiClusterConfig.userName), 'authFile'); @@ -224,7 +222,7 @@ class PAIYarnTrainingService extends PAITrainingService { version, this.logCollection ) - .replace(/\r\n|\n|\r/gm, ''); + .replace(/\r\n|\n|\r/gm, ''); this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); const paiTaskRoles: PAITaskRole[] = [ @@ -283,10 +281,8 @@ class PAIYarnTrainingService extends PAITrainingService { Authorization: `Bearer ${this.paiToken}` } }; - request(submitJobRequest, (error: Error, response: request.Response, body: any) => { + request(submitJobRequest, (error: Error, response: request.Response, _body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 400) { - const errorMessage: string = (error !== undefined && error !== null) ? error.message : - `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body.message}`; trialJobDetail.status = 'FAILED'; deferred.resolve(true); } else { @@ -343,7 +339,7 @@ class PAIYarnTrainingService extends PAITrainingService { json: true, body: parameterFileMeta }; - request(req, (err: Error, res: request.Response) => { + request(req, (err: Error, _res: request.Response) => { if (err) { deferred.reject(err); } else { diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index b614196548..157da50d0c 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -351,7 +351,7 @@ class RemoteMachineTrainingService implements TrainingService { * Get culster metadata * @param key metadata key */ - public async getClusterMetadata(key: string): Promise { + public async getClusterMetadata(_key: string): Promise { return ""; } diff --git a/src/nni_manager/yarn.lock b/src/nni_manager/yarn.lock index 66e94aa8d1..ab14b07403 100644 --- a/src/nni_manager/yarn.lock +++ b/src/nni_manager/yarn.lock @@ -1332,11 +1332,6 @@ deepmerge@^2.1.1: version "2.2.1" resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-2.2.1.tgz#5d3ff22a01c00f645405a2fbc17d0778a1801170" -deepmerge@^4.2.2: - version "4.2.2" - resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955" - integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg== - default-require-extensions@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-3.0.0.tgz#e03f93aac9b2b6443fc52e5e4a37b3ad9ad8df96" diff --git a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py index b6d0d9b7ba..ee110dd5d1 100644 --- a/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/classic_nas/mutator.py @@ -92,8 +92,8 @@ def _sample_layer_choice(self, mutable, idx, value, search_space_item): The list for corresponding search space. """ # doesn't support multihot for layer choice yet - onehot_list = [False] * mutable.length - assert 0 <= idx < mutable.length and search_space_item[idx] == value, \ + onehot_list = [False] * len(mutable) + assert 0 <= idx < len(mutable) and search_space_item[idx] == value, \ "Index '{}' in search space '{}' is not '{}'".format(idx, search_space_item, value) onehot_list[idx] = True return torch.tensor(onehot_list, dtype=torch.bool) # pylint: disable=not-callable diff --git a/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py index 2aba20dd45..a4c3898a9b 100644 --- a/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/darts/mutator.py @@ -61,7 +61,7 @@ def sample_final(self): if isinstance(mutable, LayerChoice): max_val, index = torch.max(F.softmax(self.choices[mutable.key], dim=-1)[:-1], 0) edges_max[mutable.key] = max_val - result[mutable.key] = F.one_hot(index, num_classes=mutable.length).view(-1).bool() + result[mutable.key] = F.one_hot(index, num_classes=len(mutable)).view(-1).bool() for mutable in self.mutables: if isinstance(mutable, InputChoice): if mutable.n_chosen is not None: diff --git a/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py index 8cd107ec9d..7763622a58 100644 --- a/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/enas/mutator.py @@ -86,15 +86,15 @@ def __init__(self, model, lstm_size=64, lstm_num_layers=1, tanh_constant=1.5, ce for mutable in self.mutables: if isinstance(mutable, LayerChoice): if self.max_layer_choice == 0: - self.max_layer_choice = mutable.length - assert self.max_layer_choice == mutable.length, \ + self.max_layer_choice = len(mutable) + assert self.max_layer_choice == len(mutable), \ "ENAS mutator requires all layer choice have the same number of candidates." # We are judging by keys and module types to add biases to layer choices. Needs refactor. if "reduce" in mutable.key: def is_conv(choice): return "conv" in str(type(choice)).lower() bias = torch.tensor([self.branch_bias if is_conv(choice) else -self.branch_bias # pylint: disable=not-callable - for choice in mutable.choices]) + for choice in mutable]) self.bias_dict[mutable.key] = nn.Parameter(bias, requires_grad=False) self.embedding = nn.Embedding(self.max_layer_choice + 1, self.lstm_size) diff --git a/src/sdk/pynni/nni/nas/pytorch/mutables.py b/src/sdk/pynni/nni/nas/pytorch/mutables.py index 46d08fd756..5dbed524e0 100644 --- a/src/sdk/pynni/nni/nas/pytorch/mutables.py +++ b/src/sdk/pynni/nni/nas/pytorch/mutables.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. import logging +import warnings from collections import OrderedDict import torch.nn as nn @@ -140,9 +141,12 @@ class LayerChoice(Mutable): Attributes ---------- length : int - Number of ops to choose from. - names: list of str + Deprecated. Number of ops to choose from. ``len(layer_choice)`` is recommended. + names : list of str Names of candidates. + choices : list of Module + Deprecated. A list of all candidate modules in the layer choice module. + ``list(layer_choice)`` is recommended, which will serve the same purpose. Notes ----- @@ -156,30 +160,65 @@ class LayerChoice(Mutable): ("conv7x7", nn.Conv2d(7, 16, 128)) ])) + Elements in layer choice can be modified or deleted. Use ``del self.op_choice["conv5x5"]`` or + ``self.op_choice[1] = nn.Conv3d(...)``. Adding more choices is not supported yet. """ def __init__(self, op_candidates, reduction="sum", return_mask=False, key=None): super().__init__(key=key) - self.length = len(op_candidates) - self.choices = [] self.names = [] if isinstance(op_candidates, OrderedDict): for name, module in op_candidates.items(): assert name not in ["length", "reduction", "return_mask", "_key", "key", "names"], \ "Please don't use a reserved name '{}' for your module.".format(name) self.add_module(name, module) - self.choices.append(module) self.names.append(name) elif isinstance(op_candidates, list): for i, module in enumerate(op_candidates): self.add_module(str(i), module) - self.choices.append(module) self.names.append(str(i)) else: raise TypeError("Unsupported op_candidates type: {}".format(type(op_candidates))) self.reduction = reduction self.return_mask = return_mask + def __getitem__(self, idx): + if isinstance(idx, str): + return self._modules[idx] + return list(self)[idx] + + def __setitem__(self, idx, module): + key = idx if isinstance(idx, str) else self.names[idx] + return setattr(self, key, module) + + def __delitem__(self, idx): + if isinstance(idx, slice): + for key in self.names[idx]: + delattr(self, key) + else: + if isinstance(idx, str): + key, idx = idx, self.names.index(idx) + else: + key = self.names[idx] + delattr(self, key) + del self.names[idx] + + @property + def length(self): + warnings.warn("layer_choice.length is deprecated. Use `len(layer_choice)` instead.", DeprecationWarning) + return len(self) + + def __len__(self): + return len(self.names) + + def __iter__(self): + return map(lambda name: self._modules[name], self.names) + + @property + def choices(self): + warnings.warn("layer_choice.choices is deprecated. Use `list(layer_choice)` instead.", DeprecationWarning) + return list(self) + def forward(self, *args, **kwargs): """ Returns diff --git a/src/sdk/pynni/nni/nas/pytorch/mutator.py b/src/sdk/pynni/nni/nas/pytorch/mutator.py index e461d50206..160a20de84 100644 --- a/src/sdk/pynni/nni/nas/pytorch/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/mutator.py @@ -150,16 +150,16 @@ def on_forward_layer_choice(self, mutable, *args, **kwargs): """ if self._connect_all: return self._all_connect_tensor_reduction(mutable.reduction, - [op(*args, **kwargs) for op in mutable.choices]), \ - torch.ones(mutable.length) + [op(*args, **kwargs) for op in mutable]), \ + torch.ones(len(mutable)) def _map_fn(op, args, kwargs): return op(*args, **kwargs) mask = self._get_decision(mutable) - assert len(mask) == len(mutable.choices), \ - "Invalid mask, expected {} to be of length {}.".format(mask, len(mutable.choices)) - out = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable.choices], mask) + assert len(mask) == len(mutable), \ + "Invalid mask, expected {} to be of length {}.".format(mask, len(mutable)) + out = self._select_with_mask(_map_fn, [(choice, args, kwargs) for choice in mutable], mask) return self._tensor_reduction(mutable.reduction, out), mask def on_forward_input_choice(self, mutable, tensor_list): diff --git a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py index 47aedfa1b2..108557f30e 100644 --- a/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/pdarts/mutator.py @@ -32,7 +32,7 @@ def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}): for mutable in self.mutables: if isinstance(mutable, LayerChoice): - switches = self.switches.get(mutable.key, [True for j in range(mutable.length)]) + switches = self.switches.get(mutable.key, [True for j in range(len(mutable))]) choices = self.choices[mutable.key] operations_count = np.sum(switches) @@ -48,12 +48,12 @@ def __init__(self, model, pdarts_epoch_index, pdarts_num_to_drop, switches={}): if isinstance(module, LayerChoice): switches = self.switches.get(module.key) choices = self.choices[module.key] - if len(module.choices) > len(choices): + if len(module) > len(choices): # from last to first, so that it won't effect previous indexes after removed one. for index in range(len(switches)-1, -1, -1): if switches[index] == False: - del(module.choices[index]) - module.length -= 1 + del module[index] + assert len(module) <= len(choices), "Failed to remove dropped choices." def sample_final(self): results = super().sample_final() diff --git a/src/sdk/pynni/nni/nas/pytorch/proxylessnas/mutator.py b/src/sdk/pynni/nni/nas/pytorch/proxylessnas/mutator.py index eb768e6fff..881a6b4403 100644 --- a/src/sdk/pynni/nni/nas/pytorch/proxylessnas/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/proxylessnas/mutator.py @@ -53,15 +53,15 @@ def __init__(self, mutable): A LayerChoice in user model """ super(MixedOp, self).__init__() - self.ap_path_alpha = nn.Parameter(torch.Tensor(mutable.length)) - self.ap_path_wb = nn.Parameter(torch.Tensor(mutable.length)) + self.ap_path_alpha = nn.Parameter(torch.Tensor(len(mutable))) + self.ap_path_wb = nn.Parameter(torch.Tensor(len(mutable))) self.ap_path_alpha.requires_grad = False self.ap_path_wb.requires_grad = False self.active_index = [0] self.inactive_index = None self.log_prob = None self.current_prob_over_ops = None - self.n_choices = mutable.length + self.n_choices = len(mutable) def get_ap_path_alpha(self): return self.ap_path_alpha @@ -120,8 +120,8 @@ def backward(_x, _output, grad_output): return binary_grads return backward output = ArchGradientFunction.apply( - x, self.ap_path_wb, run_function(mutable.key, mutable.choices, self.active_index[0]), - backward_function(mutable.key, mutable.choices, self.active_index[0], self.ap_path_wb)) + x, self.ap_path_wb, run_function(mutable.key, list(mutable), self.active_index[0]), + backward_function(mutable.key, list(mutable), self.active_index[0], self.ap_path_wb)) else: output = self.active_op(mutable)(x) return output @@ -164,7 +164,7 @@ def active_op(self, mutable): PyTorch module the chosen operation """ - return mutable.choices[self.active_index[0]] + return mutable[self.active_index[0]] @property def active_op_index(self): @@ -222,12 +222,12 @@ def binarize(self, mutable): sample = torch.multinomial(probs, 1)[0].item() self.active_index = [sample] self.inactive_index = [_i for _i in range(0, sample)] + \ - [_i for _i in range(sample + 1, len(mutable.choices))] + [_i for _i in range(sample + 1, len(mutable))] self.log_prob = torch.log(probs[sample]) self.current_prob_over_ops = probs self.ap_path_wb.data[sample] = 1.0 # avoid over-regularization - for choice in mutable.choices: + for choice in mutable: for _, param in choice.named_parameters(): param.grad = None @@ -430,8 +430,8 @@ def unused_modules_off(self): involved_index = mixed_op.active_index for i in range(mixed_op.n_choices): if i not in involved_index: - unused[i] = mutable.choices[i] - mutable.choices[i] = None + unused[i] = mutable[i] + mutable[i] = None self._unused_modules.append(unused) def unused_modules_back(self): @@ -442,7 +442,7 @@ def unused_modules_back(self): return for m, unused in zip(self.mutable_list, self._unused_modules): for i in unused: - m.choices[i] = unused[i] + m[i] = unused[i] self._unused_modules = None def arch_requires_grad(self): @@ -474,5 +474,5 @@ def sample_final(self): assert isinstance(mutable, LayerChoice) index, _ = mutable.registered_module.chosen_index # pylint: disable=not-callable - result[mutable.key] = F.one_hot(torch.tensor(index), num_classes=mutable.length).view(-1).bool() + result[mutable.key] = F.one_hot(torch.tensor(index), num_classes=len(mutable)).view(-1).bool() return result diff --git a/src/sdk/pynni/nni/nas/pytorch/random/mutator.py b/src/sdk/pynni/nni/nas/pytorch/random/mutator.py index 2a8cb25ef2..f302db56c0 100644 --- a/src/sdk/pynni/nni/nas/pytorch/random/mutator.py +++ b/src/sdk/pynni/nni/nas/pytorch/random/mutator.py @@ -18,8 +18,8 @@ def sample_search(self): result = dict() for mutable in self.mutables: if isinstance(mutable, LayerChoice): - gen_index = torch.randint(high=mutable.length, size=(1, )) - result[mutable.key] = F.one_hot(gen_index, num_classes=mutable.length).view(-1).bool() + gen_index = torch.randint(high=len(mutable), size=(1, )) + result[mutable.key] = F.one_hot(gen_index, num_classes=len(mutable)).view(-1).bool() elif isinstance(mutable, InputChoice): if mutable.n_chosen is None: result[mutable.key] = torch.randint(high=2, size=(mutable.n_candidates,)).view(-1).bool() diff --git a/src/sdk/pynni/tests/models/pytorch_models/__init__.py b/src/sdk/pynni/tests/models/pytorch_models/__init__.py index 46d4482c86..363c7d3c9c 100644 --- a/src/sdk/pynni/tests/models/pytorch_models/__init__.py +++ b/src/sdk/pynni/tests/models/pytorch_models/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +from .layer_choice_only import LayerChoiceOnlySearchSpace from .mutable_scope import SpaceWithMutableScope from .naive import NaiveSearchSpace from .nested import NestedSpace diff --git a/src/sdk/pynni/tests/models/pytorch_models/layer_choice_only.py b/src/sdk/pynni/tests/models/pytorch_models/layer_choice_only.py new file mode 100644 index 0000000000..c500bc9cdc --- /dev/null +++ b/src/sdk/pynni/tests/models/pytorch_models/layer_choice_only.py @@ -0,0 +1,38 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from nni.nas.pytorch.mutables import LayerChoice + + +class LayerChoiceOnlySearchSpace(nn.Module): + def __init__(self, test_case): + super().__init__() + self.test_case = test_case + self.conv1 = LayerChoice([nn.Conv2d(3, 6, 3, padding=1), nn.Conv2d(3, 6, 5, padding=2)]) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = LayerChoice([nn.Conv2d(6, 16, 3, padding=1), nn.Conv2d(6, 16, 5, padding=2)], + return_mask=True) + self.conv3 = nn.Conv2d(16, 16, 1) + self.bn = nn.BatchNorm2d(16) + + self.gap = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Linear(16, 10) + + def forward(self, x): + bs = x.size(0) + + x = self.pool(F.relu(self.conv1(x))) + x0, mask = self.conv2(x) + self.test_case.assertEqual(mask.size(), torch.Size([2])) + x1 = F.relu(self.conv3(x0)) + + x = self.pool(self.bn(x1)) + self.test_case.assertEqual(mask.size(), torch.Size([2])) + + x = self.gap(x).view(bs, -1) + x = self.fc(x) + return x diff --git a/src/sdk/pynni/tests/test_nas.py b/src/sdk/pynni/tests/test_nas.py index 53b52541ad..5c1799a4a8 100644 --- a/src/sdk/pynni/tests/test_nas.py +++ b/src/sdk/pynni/tests/test_nas.py @@ -3,6 +3,7 @@ import importlib import os import sys +from collections import OrderedDict from unittest import TestCase, main import torch @@ -11,6 +12,7 @@ from nni.nas.pytorch.darts import DartsMutator from nni.nas.pytorch.enas import EnasMutator from nni.nas.pytorch.fixed import apply_fixed_architecture +from nni.nas.pytorch.mutables import LayerChoice from nni.nas.pytorch.random import RandomMutator from nni.nas.pytorch.utils import _reset_global_mutable_counting @@ -101,6 +103,43 @@ def test_classic_nas(self): get_and_apply_next_architecture(model) self.iterative_sample_and_forward(model) + def test_proxylessnas(self): + model = self.model_module.LayerChoiceOnlySearchSpace(self) + get_and_apply_next_architecture(model) + self.iterative_sample_and_forward(model) + + def test_layer_choice(self): + for i in range(2): + for j in range(2): + if j == 0: + # test number + layer_choice = LayerChoice([nn.Conv2d(3, 3, 3), nn.Conv2d(3, 5, 3), nn.Conv2d(3, 6, 3)]) + else: + # test ordered dict + layer_choice = LayerChoice(OrderedDict([ + ("conv1", nn.Conv2d(3, 3, 3)), + ("conv2", nn.Conv2d(3, 5, 3)), + ("conv3", nn.Conv2d(3, 6, 3)) + ])) + if i == 0: + # test modify + self.assertEqual(len(layer_choice.choices), 3) + layer_choice[1] = nn.Conv2d(3, 4, 3) + self.assertEqual(layer_choice[1].out_channels, 4) + self.assertEqual(len(layer_choice[0:2]), 2) + if j > 0: + layer_choice["conv3"] = nn.Conv2d(3, 7, 3) + self.assertEqual(layer_choice[-1].out_channels, 7) + if i == 1: + # test delete + del layer_choice[1] + self.assertEqual(len(layer_choice), 2) + self.assertEqual(len(list(layer_choice)), 2) + self.assertEqual(layer_choice.names, ["conv1", "conv3"] if j > 0 else ["0", "2"]) + if j > 0: + del layer_choice["conv1"] + self.assertEqual(len(layer_choice), 1) + if __name__ == '__main__': main() diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index a67695c1f4..392235d0cf 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -287,7 +287,7 @@ def setPathCheck(key): 'codeDir': setPathCheck('codeDir'), 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), 'containerNFSMountPath': setType('containerNFSMountPath', str), - 'command': setType('command', str), + Optional('command'): setType('command', str), Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999), Optional('memoryMB'): setType('memoryMB', int), diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index ee35a8213a..fad2599717 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -266,35 +266,14 @@ def validate_pai_config_path(experiment_config): '''validate paiConfigPath field''' if experiment_config.get('trainingServicePlatform') == 'pai': if experiment_config.get('trial', {}).get('paiConfigPath'): - # validate the file format of paiConfigPath, ensure it is yaml format + # validate commands pai_config = get_yml_content(experiment_config['trial']['paiConfigPath']) - if experiment_config['trial'].get('image') is None: - if pai_config.get('prerequisites', [{}])[0].get('uri') is None: - print_error('Please set image field, or set image uri in your own paiConfig!') - exit(1) - experiment_config['trial']['image'] = pai_config['prerequisites'][0]['uri'] - if experiment_config['trial'].get('gpuNum') is None: - if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('gpu') is None: - print_error('Please set gpuNum field, or set resourcePerInstance gpu in your own paiConfig!') - exit(1) - experiment_config['trial']['gpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['gpu'] - if experiment_config['trial'].get('cpuNum') is None: - if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('cpu') is None: - print_error('Please set cpuNum field, or set resourcePerInstance cpu in your own paiConfig!') - exit(1) - experiment_config['trial']['cpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['cpu'] - if experiment_config['trial'].get('memoryMB') is None: - if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('memoryMB', {}) is None: - print_error('Please set memoryMB field, or set resourcePerInstance memoryMB in your own paiConfig!') - exit(1) - experiment_config['trial']['memoryMB'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['memoryMB'] - if experiment_config['trial'].get('paiStoragePlugin') is None: - if pai_config.get('extras', {}).get('com.microsoft.pai.runtimeplugin', [{}])[0].get('plugin') is None: - print_error('Please set paiStoragePlugin field, or set plugin in your own paiConfig!') - exit(1) - experiment_config['trial']['paiStoragePlugin'] = pai_config['extras']['com.microsoft.pai.runtimeplugin'][0]['plugin'] + taskRoles_dict = pai_config.get('taskRoles') + if not taskRoles_dict: + print_error('Please set taskRoles in paiConfigPath config file!') + exit(1) else: - pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStoragePlugin'] + pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStoragePlugin', 'command'] for trial_field in pai_trial_fields_required_list: if experiment_config['trial'].get(trial_field) is None: print_error('Please set {0} in trial configuration,\