From 9a8ac161e066e5ed11c98f7f30681bb815952f21 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Wed, 19 Sep 2018 18:17:57 +0800 Subject: [PATCH 01/60] PAI Training service implementation, v1 --- Makefile | 18 +- setup.py | 5 +- src/nni_manager/main.ts | 7 +- src/nni_manager/package.json | 4 +- .../rest_server/restValidationSchemas.ts | 14 +- .../training_service/common/jobMetrics.ts | 37 +++ .../common/trialConfigMetadataKey.ts | 3 +- .../training_service/pai/hdfsClientUtility.ts | 200 ++++++++++++ .../training_service/pai/paiConfig.ts | 123 ++++++++ .../training_service/pai/paiData.ts | 48 +++ .../training_service/pai/paiJobRestServer.ts | 73 ++++- .../pai/paiTrainingService.ts | 298 ++++++++++++++++++ .../training_service/pai/paiTrialConfig.ts | 39 +++ .../remote_machine/metricsCollector.ts | 3 +- .../remote_machine/remoteMachineData.ts | 19 +- .../remoteMachineTrainingService.ts | 8 +- .../test/hdfsClientUtility.test.ts | 143 +++++++++ .../test/paiTrainingService.test.ts | 95 ++++++ .../training_service_tool/setup.py | 20 ++ .../training_service_tool/trial/__init__.py | 0 .../training_service_tool/trial/constants.py | 37 +++ .../trial/metrics_reader.py | 123 ++++++++ .../training_service_tool/trial/rest_utils.py | 57 ++++ .../trial/trial_keeper.py | 73 +++++ .../training_service_tool/trial/url_utils.py | 25 ++ src/nni_manager/yarn.lock | 116 ++++++- 26 files changed, 1544 insertions(+), 44 deletions(-) create mode 100644 src/nni_manager/training_service/common/jobMetrics.ts create mode 100644 src/nni_manager/training_service/pai/hdfsClientUtility.ts create mode 100644 src/nni_manager/training_service/pai/paiConfig.ts create mode 100644 src/nni_manager/training_service/pai/paiData.ts create mode 100644 src/nni_manager/training_service/pai/paiTrainingService.ts create mode 100644 src/nni_manager/training_service/pai/paiTrialConfig.ts create mode 100644 src/nni_manager/training_service/test/hdfsClientUtility.test.ts create mode 100644 
src/nni_manager/training_service/test/paiTrainingService.test.ts create mode 100644 src/nni_manager/training_service_tool/setup.py create mode 100644 src/nni_manager/training_service_tool/trial/__init__.py create mode 100644 src/nni_manager/training_service_tool/trial/constants.py create mode 100644 src/nni_manager/training_service_tool/trial/metrics_reader.py create mode 100644 src/nni_manager/training_service_tool/trial/rest_utils.py create mode 100644 src/nni_manager/training_service_tool/trial/trial_keeper.py create mode 100644 src/nni_manager/training_service_tool/trial/url_utils.py diff --git a/Makefile b/Makefile index f429fc20bb..1604fb1937 100644 --- a/Makefile +++ b/Makefile @@ -90,14 +90,9 @@ build: #$(_INFO) Building nnictl $(_END) cd tools && python3 setup.py build + #$(_INFO) Building Training Service tool $(_END) + cd src/nni_manager/training_service_tool && python3 setup.py build -# Standard installation target -# Must be invoked after building -.PHONY: install -install: install-python-modules -install: install-node-modules -install: install-scripts -install: install-examples install: #$(_INFO) Complete! 
You may want to add $(BIN_PATH) to your PATH environment $(_END) @@ -107,6 +102,7 @@ install: .PHONY: remote-machine-install remote-machine-install: cd src/sdk/pynni && python3 setup.py install $(PIP_MODE) + cd src/nni_manager/training_service_tool && python3 setup.py install $(PIP_MODE) # All-in-one target for non-expert users @@ -145,6 +141,7 @@ dev-install: uninstall: -pip3 uninstall -y nni -pip3 uninstall -y nnictl + -pip3 uninstall -y nnits-tool -rm -rf $(INSTALL_PREFIX)/nni -rm -f $(BIN_PATH)/nnimanager -rm -f $(BIN_PATH)/nnictl @@ -203,6 +200,8 @@ install-python-modules: #$(_INFO) Installing nnictl $(_END) cd tools && python3 setup.py install $(PIP_MODE) + #$(_INFO) Installing NNI training service tool $(_END) + cd src/nni_manager/training_service_tool && python3 setup.py install $(PIP_MODE) .PHONY: install-node-modules install-node-modules: @@ -223,7 +222,10 @@ install-dev-modules: #$(_INFO) Installing nnictl $(_END) cd tools && pip3 install $(PIP_MODE) -e . - + + #$(_INFO) Installing NNI training service tool $(_END) + cd src/nni_manager/training_service_tool && pip3 install $(PIP_MODE) -e . 
+ mkdir -p $(INSTALL_PREFIX)/nni #$(_INFO) Installing NNI Manager $(_END) diff --git a/setup.py b/setup.py index eeee54d075..9545ead48c 100644 --- a/setup.py +++ b/setup.py @@ -65,11 +65,12 @@ def run(self): license = 'MIT', url = 'https://github.com/Microsoft/nni', - packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools'), + packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools') + find_packages('src/nni_manager/training_service_tool'), package_dir = { 'nni_annotation': 'tools/nni_annotation', 'nni': 'src/sdk/pynni/nni', - 'nnicmd': 'tools/nnicmd' + 'nnicmd': 'tools/nnicmd', + 'trialkeeper': 'src/nni_manager/training_service_tool/trialkeeper' }, python_requires = '>=3.5', install_requires = [ diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 6d9c9fa64b..f3d386eccd 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -36,6 +36,7 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini import { RemoteMachineTrainingService } from './training_service/remote_machine/remoteMachineTrainingService'; +import { PAITrainingService } from './training_service/pai/paiTrainingService' function initStartupInfo(startExpMode: string, resumeExperimentId: string) { @@ -49,6 +50,8 @@ async function initContainer(platformMode: string): Promise { Container.bind(TrainingService).to(LocalTrainingServiceForGPU).scope(Scope.Singleton); } else if (platformMode === 'remote') { Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton); + } else if (platformMode === 'pai'){ + Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton); } else { throw new Error(`Error: unsupported mode: ${mode}`); } @@ -61,7 +64,7 @@ async function initContainer(platformMode: string): Promise { } function usage(): void { - console.info('usage: node main.js --port --mode --start_mode --experiment_id '); + console.info('usage: node main.js 
--port --mode --start_mode --experiment_id '); } let port: number = NNIRestServer.DEFAULT_PORT; @@ -71,7 +74,7 @@ if (strPort && strPort.length > 0) { } const mode: string = parseArg(['--mode', '-m']); -if (!['local', 'remote'].includes(mode)) { +if (!['local', 'remote', 'pai'].includes(mode)) { usage(); process.exit(1); } diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index 46522044fd..04ee4df3c2 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -23,7 +23,8 @@ "tree-kill": "^1.2.0", "ts-deferred": "^1.0.4", "typescript-ioc": "^1.2.4", - "typescript-string-operations": "^1.3.1" + "typescript-string-operations": "^1.3.1", + "webhdfs":"^1.2.0" }, "devDependencies": { "@types/chai": "^4.1.4", @@ -40,6 +41,7 @@ "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", + "rmdir": "^1.2.0", "tmp": "^0.0.33", "ts-node": "^7.0.0", "tslint": "^5.11.0", diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 218a8c22c4..22f1acb222 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -33,9 +33,19 @@ export namespace ValidationSchemas { passphrase: joi.string() })), trial_config: joi.object({ - gpuNum: joi.number().min(0).required(), + image: joi.string().min(1), codeDir: joi.string().min(1).required(), - command: joi.string().min(1).required() + dataDir: joi.string(), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + pai_config: joi.object({ + userName: joi.string().min(1).required(), + passWord: joi.string().min(1).required(), + host: joi.string().min(1).required() }) } }; diff --git a/src/nni_manager/training_service/common/jobMetrics.ts b/src/nni_manager/training_service/common/jobMetrics.ts new file mode 100644 index 0000000000..90228ffa7d 
--- /dev/null +++ b/src/nni_manager/training_service/common/jobMetrics.ts @@ -0,0 +1,37 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import { TrialJobStatus } from '../../common/trainingService'; + +// tslint:disable-next-line:max-classes-per-file +export class JobMetrics { + public readonly jobId: string; + public readonly metrics: string[]; + public readonly jobStatus: TrialJobStatus; + public readonly endTimestamp: number; + + constructor(jobId : string, metrics : string[], jobStatus : TrialJobStatus, endTimestamp : number) { + this.jobId = jobId; + this.metrics = metrics; + this.jobStatus = jobStatus; + this.endTimestamp = endTimestamp; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts index e9749e562e..12df449ee1 100644 --- a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts +++ b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts @@ -26,5 +26,6 @@ export enum TrialConfigMetadataKey { MACHINE_LIST = 'machine_list', TRIAL_CONFIG = 'trial_config', EXPERIMENT_ID = 'experimentId', - RANDOM_SCHEDULER = 'random_scheduler' + RANDOM_SCHEDULER = 'random_scheduler', + PAI_CLUSTER_CONFIG = 'pai_config' } diff --git a/src/nni_manager/training_service/pai/hdfsClientUtility.ts b/src/nni_manager/training_service/pai/hdfsClientUtility.ts new file mode 100644 index 0000000000..07dcc2a744 --- /dev/null +++ b/src/nni_manager/training_service/pai/hdfsClientUtility.ts @@ -0,0 +1,200 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +import * as path from 'path'; +import * as fs from 'fs'; +import { Deferred } from 'ts-deferred'; +import { getLogger } from '../../common/log'; + +/** + * HDFS client utility, including copy file/directory + */ +export namespace HDFSClientUtility { + /** + * Copy a local file to hdfs directory + * + * @param localFilePath local file path(source) + * @param hdfsFilePath hdfs file path(target) + * @param hdfsClient hdfs client + */ + export async function copyFileToHdfs(localFilePath : string, hdfsFilePath : string, hdfsClient : any) : Promise { + const deferred: Deferred = new Deferred(); + fs.exists(localFilePath, (exists : boolean) => { + // Detect if local file exist + if (exists) { + var localFileStream = fs.createReadStream(localFilePath); + var hdfsFileStream = hdfsClient.createWriteStream(hdfsFilePath); + localFileStream.pipe(hdfsFileStream); + hdfsFileStream.on('finish', function onFinish () { + deferred.resolve(); + }); + hdfsFileStream.on('error', (err : any) => { + getLogger().error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`); + deferred.reject(err); + }); + } else { + getLogger().error(`HDFSCientUtility:copyFileToHdfs, ${localFilePath} doesn't exist locally`); + deferred.reject('file not exist!'); + } + }); + return deferred.promise; + } + + /** + * Recursively copy local directory to hdfs directory + * + * @param localDirectory local directory + * @param hdfsDirectory HDFS directory + * @param hdfsClient HDFS client + */ + export async function copyDirectoryToHdfs(localDirectory : string, hdfsDirectory : string, hdfsClient : any) : Promise{ + const deferred: Deferred = new Deferred(); + // TODO: fs.readdirSync doesn't support ~($HOME) + const fileNameArray: string[] = fs.readdirSync(localDirectory); + + for(var fileName of fileNameArray){ + const fullFilePath: string = path.join(localDirectory, fileName); + try { + if (fs.lstatSync(fullFilePath).isFile()) { + await copyFileToHdfs(fullFilePath, 
path.join(hdfsDirectory, fileName), hdfsClient); + } else { + // If filePath is a directory, recuisively copy it to remote directory + await copyDirectoryToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient); + } + } catch(error) { + deferred.reject(error); + } + } + // All files/directories are copied successfully, resolve + deferred.resolve(); + + return deferred.promise; + } + + /** + * Read content from HDFS file + * + * @param hdfsPath HDFS file path + * @param hdfsClient HDFS client + */ + export async function readFileFromHDFS(hdfsPath : string, hdfsClient :any) : Promise { + const deferred: Deferred = new Deferred(); + let buffer : Buffer = Buffer.alloc(0); + + const exist : boolean = await pathExists(hdfsPath, hdfsClient); + if(!exist) { + deferred.reject(`${hdfsPath} doesn't exists`); + } + + const remoteFileStream = hdfsClient.createReadStream(hdfsPath); + remoteFileStream.on('error', (err : any) => { + // Reject with the error + deferred.reject(err); + }); + + remoteFileStream.on('data', (chunk : any) => { + // Concat the data chunk to buffer + buffer = Buffer.concat([buffer, chunk]); + }); + + remoteFileStream.on('finish', function onFinish () { + // Upload is done, resolve + deferred.resolve(buffer); + }); + + return deferred.promise; + } + + /** + * Check if an HDFS path already exists + * + * @param hdfsPath target path need to check in HDFS + * @param hdfsClient HDFS client + */ + export async function pathExists(hdfsPath : string, hdfsClient : any) : Promise { + const deferred : Deferred = new Deferred(); + hdfsClient.exists(hdfsPath, (exist : boolean ) => { + deferred.resolve(exist); + }) + + return deferred.promise; + } + + /** + * Mkdir in HDFS, use default permission 755 + * + * @param hdfsPath the path in HDFS. 
It could be either file or directory + * @param hdfsClient + */ + export function mkdir(hdfsPath : string, hdfsClient : any) : Promise { + const deferred : Deferred = new Deferred(); + + hdfsClient.mkdir(hdfsPath, (err : any)=> { + if(!err) { + deferred.resolve(true); + } else { + deferred.reject(err.message); + } + }); + + return deferred.promise; + } + + /** + * Read directory contents + * + * @param hdfsPath the path in HDFS. It could be either file or directory + * @param hdfsClient + */ + export async function readdir(hdfsPath : string, hdfsClient : any) : Promise { + const deferred : Deferred = new Deferred(); + const exist : boolean = await pathExists(hdfsPath, hdfsClient); + if(!exist) { + deferred.reject(`${hdfsPath} doesn't exists`); + } + + hdfsClient.readdir(hdfsPath, (err : any, files : any[] ) => { + if(err) { + deferred.reject(err); + } + + deferred.resolve(files); + }); + + return deferred.promise; + } + + /** + * Delete HDFS path + * @param hdfsPath the path in HDFS. It could be either file or directory + * @param hdfsClient + * @param recursive Mark if need to delete recursively + */ + export function deletePath(hdfsPath : string, hdfsClient : any, recursive : boolean = true) : Promise { + const deferred : Deferred = new Deferred(); + hdfsClient.unlink(hdfsPath, recursive, (err : any)=> { + if(!err) { + deferred.resolve(true); + } else { + deferred.reject(err.message); + } + }); + return deferred.promise; + } +} diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts new file mode 100644 index 0000000000..782e0790f1 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -0,0 +1,123 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import {TrialConfig} from '../common/trialConfig' + +export class PAITaskRole { + // Name for the task role + public readonly name: string; + // Number of tasks for the task role, no less than 1 + public readonly taskNumber: number; + // CPU number for one task in the task role, no less than 1 + public readonly cpuNumber: number; + // Memory for one task in the task role, no less than 100 + public readonly memoryMB: number; + // GPU number for one task in the task role, no less than 0 + public readonly gpuNumber: number; + // Executable command for tasks in the task role, can not be empty + public readonly command: string; + + /** + * Constructor + * @param name Name for the task role + * @param taskNumber Number of tasks for the task role, no less than 1 + * @param cpuNumber CPU number for one task in the task role, no less than 1 + * @param memoryMB Memory for one task in the task role, no less than 100 + * @param gpuNumber GPU number for one task in the task role, no less than 0 + * @param command Executable command for tasks in the task role, can not be empty + */ + constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, command : string) { + this.name = name; + this.taskNumber = taskNumber; + this.cpuNumber = cpuNumber; + this.memoryMB = memoryMB; + this.gpuNumber = gpuNumber; + this.command = command; + } +} + +export class PAIJobConfig{ + // Name for the job, need to be unique + public readonly jobName: string; + // URL pointing to the Docker image for all tasks in the job + public readonly image: string; + // Data directory existing on HDFS + public readonly dataDir: string; + // Output directory on HDFS + public readonly outputDir: string; + // Code directory on HDFS + public readonly codeDir: string; + + // List of taskRole, one task role at least + public taskRoles: PAITaskRole[]; + + /** + * Constructor + * @param jobName Name for the job, need to be unique + * @param image URL 
pointing to the Docker image for all tasks in the job + * @param dataDir Data directory existing on HDFS + * @param outputDir Output directory on HDFS + * @param taskRoles List of taskRole, one task role at least + */ + constructor(jobName: string, image : string, dataDir : string, outputDir : string, codeDir : string, taskRoles : PAITaskRole[]){ + this.jobName = jobName; + this.image = image; + this.dataDir = dataDir; + this.outputDir = outputDir; + this.codeDir = codeDir; + this.taskRoles = taskRoles; + } +} + +export class PAIClusterConfig { + public readonly userName: string; + public readonly passWord: string; + public readonly host: string; + + /** + * Constructor + * @param userName User name of PAI Cluster + * @param passWord password of PAI Cluster + * @param host Host IP of PAI Cluster + */ + constructor(userName: string, passWord : string, host : string){ + this.userName = userName; + this.passWord = passWord; + this.host = host; + } +} + +export class NNIPAITrialConfig extends TrialConfig{ + public readonly cpuNum: number; + public readonly memoryMB: number; + public readonly image: string; + public readonly dataDir: string; + public readonly outputDir: string; + + constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) { + super(command, codeDir, gpuNum); + this.cpuNum = cpuNum; + this.memoryMB = memoryMB; + this.image = image; + this.dataDir = dataDir; + this.outputDir = outputDir; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts new file mode 100644 index 0000000000..0270ef0e12 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -0,0 +1,48 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from 'common/trainingService'; + +export class PAITrialJobDetail implements TrialJobDetail { + public id: string; + public status: TrialJobStatus; + public submitTime: number; + public startTime?: number; + public endTime?: number; + public tags?: string[]; + public url?: string; + public workingDirectory: string; + public form: JobApplicationForm; + + constructor(id: string, status: TrialJobStatus, submitTime: number, workingDirectory: string, form: JobApplicationForm) { + this.id = id; + this.status = status; + this.submitTime = submitTime; + this.workingDirectory = workingDirectory; + this.form = form; + this.tags = []; + } +} + +export const PAI_TRIAL_COMMAND_FORMAT: string = +`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} +&& cd $NNI_SYS_DIR +&& python3 -m trial.trial_keeper --trial_command '{2}'`; \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts index 6375eee1c5..3079cd47ec 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -17,4 +17,75 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ - \ No newline at end of file +'use strict'; + +import { Request, Response, Router } from 'express'; +import * as bodyParser from 'body-parser'; +import * as component from '../../common/component'; +import { RestServer } from '../../common/restServer' +import { Inject } from 'typescript-ioc'; +import { PAITrainingService } from './paiTrainingService'; + +/** + * PAI Training service Rest server, provides rest API to support pai job metrics update + * + */ +@component.Singleton +export class PAIJobRestServer extends RestServer{ + /** NNI main rest service default port */ + private static readonly DEFAULT_PORT: number = 51189; + + private readonly API_ROOT_URL: string = '/api/v1/nni-pai'; + + @Inject + private readonly paiTrainingService : PAITrainingService; + + /** + * constructor to provide NNIRestServer's own rest property, e.g. port + */ + constructor() { + super(); + this.port = PAIJobRestServer.DEFAULT_PORT; + this.paiTrainingService = component.get(PAITrainingService); + } + + /** + * NNIRestServer's own router registration + */ + protected registerRestHandler(): void { + this.app.use(bodyParser.json()); + this.app.use(this.API_ROOT_URL, this.createRestHandler()); + } + + private createRestHandler() : Router { + const router: Router = Router(); + + // tslint:disable-next-line:typedef + router.use((req: Request, res: Response, next) => { + this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`); + res.setHeader('Content-Type', 'application/json'); + next(); + }); + + router.post('/update-metrics/:id', (req: Request, res: Response) => { + try { + this.log.info(`Get update-metrics request, trial job id is ${req.params.id}`); + this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`); + + this.paiTrainingService.MetricsEmitter.emit('metric', { + id : req.body.jobId, + data : req.body.metrics + }); + + res.send(); + } + catch(err) { + this.log.error(`json parse metrics error: ${err}`); + res.status(500); + 
res.send(err.message); + } + }); + + return router; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts new file mode 100644 index 0000000000..a72900465e --- /dev/null +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -0,0 +1,298 @@ + +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
'use strict'

import * as assert from 'assert';
import * as component from '../../common/component';
import * as path from 'path';
import * as request from 'request';

import { Deferred } from 'ts-deferred';
import { EventEmitter } from 'events';
import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import {
    HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm,
    TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService';
import { delay, getExperimentRootDir, uniqueString } from '../../common/utils';
import { ObservableTimer } from '../../common/observableTimer';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiData';
import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
import { HDFSClientUtility } from './hdfsClientUtility'
import { getExperimentId } from '../../common/experimentStartupInfo';


var WebHDFS = require('webhdfs');

/**
 * Training Service implementation for OpenPAI (Open Platform for AI)
 * Refer https://github.com/Microsoft/pai for more info about OpenPAI
 *
 * Life cycle as implemented here:
 *  1. setClusterMetadata(PAI_CLUSTER_CONFIG) creates the WebHDFS client and fetches a PAI token.
 *  2. setClusterMetadata(TRIAL_CONFIG) stores the trial configuration.
 *  3. submitTrialJob() uploads the code directory to HDFS and posts a job to the PAI REST server.
 */
@component.Singleton
class PAITrainingService implements TrainingService {
    private readonly log!: Logger;
    private readonly metricsEmitter: EventEmitter;
    private readonly trialJobsMap: Map<string, PAITrialJobDetail>;
    // Experiment root directory on HDFS
    private readonly expRootDir: string;
    private paiTrialConfig: NNIPAITrialConfig | undefined;
    private paiClusterConfig?: PAIClusterConfig;
    private stopping: boolean = false;
    private hdfsClient: any;
    private paiToken? : string;
    private experimentId! : string;

    constructor(@component.Inject timer: ObservableTimer) {
        this.log = getLogger();
        this.metricsEmitter = new EventEmitter();
        this.trialJobsMap = new Map<string, PAITrialJobDetail>();
        // Root dir on HDFS
        this.expRootDir = path.join('/nni', 'experiments', getExperimentId());
        this.experimentId = getExperimentId();
    }

    /**
     * Start the REST server that trial keepers report to.
     */
    public async run(): Promise<void> {
        const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
        await restServer.start();
        this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`);
    }

    /**
     * Not implemented yet: always resolves to an empty list.
     */
    public listTrialJobs(): Promise<TrialJobDetail[]> {
        const deferred : Deferred<TrialJobDetail[]> = new Deferred<TrialJobDetail[]>();

        deferred.resolve([]);
        return deferred.promise;
    }

    /**
     * Not implemented yet: always resolves to undefined.
     */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        const deferred : Deferred<TrialJobDetail> = new Deferred<TrialJobDetail>();

        deferred.resolve(undefined);
        return deferred.promise;
    }

    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
        this.metricsEmitter.on('metric', listener);
    }

    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
        this.metricsEmitter.off('metric', listener);
    }

    /**
     * Upload the trial code to HDFS and submit one PAI job for it.
     *
     * @param form job application form of the trial
     * @returns the trial job detail (status 'WAITING') once PAI accepted the job
     * @throws Error when cluster config, trial config or token is missing, when the
     *         HDFS upload fails, or when the PAI REST server refuses the job
     */
    public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
        const deferred : Deferred<PAITrialJobDetail> = new Deferred<PAITrialJobDetail>();
        const clusterConfig: PAIClusterConfig | undefined = this.paiClusterConfig;
        const trialConfig: NNIPAITrialConfig | undefined = this.paiTrialConfig;
        const token: string | undefined = this.paiToken;
        if (!clusterConfig) {
            throw new Error('PAI Cluster config is not initialized');
        }
        if (!trialConfig) {
            throw new Error('trial config is not initialized');
        }
        if (!token) {
            throw new Error('PAI token is not initialized');
        }

        this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);

        const trialJobId: string = uniqueString(5);
        //TODO: use HDFS working folder instead
        const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);

        const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
            trialJobId,
            'WAITING',
            Date.now(),
            trialWorkingFolder,
            form);
        this.trialJobsMap.set(trialJobId, trialJobDetail);

        // Step 1. Prepare PAI job configuration
        const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
        const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId);

        // The command template is multi-line; PAI expects a single-line command
        const nniPaiTrialCommand : string = String.Format(
            PAI_TRIAL_COMMAND_FORMAT,
            `./${trialJobId}`,
            trialJobId,
            trialConfig.command
        ).replace(/\r\n|\n|\r/gm, '');

        this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
        const paiTaskRoles : PAITaskRole[] = [new PAITaskRole('nni_trail_' + trialJobId,
            // Task role number
            1,
            // Task CPU number
            trialConfig.cpuNum,
            // Task memory
            trialConfig.memoryMB,
            // Task GPU number
            trialConfig.gpuNum,
            // Task command
            nniPaiTrialCommand)];

        const paiJobConfig : PAIJobConfig = new PAIJobConfig(
            // Job name
            paiJobName,
            // Docker image
            trialConfig.image,
            // dataDir
            trialConfig.dataDir,
            // outputDir
            trialConfig.outputDir,
            // codeDir
            `$PAI_DEFAULT_FS_URI${hdfsCodeDir}`,
            // TODO: Add Virtual Cluster
            // PAI Task roles
            paiTaskRoles);
        this.log.info(`PAI job config is ${JSON.stringify(paiJobConfig)}`);

        // Step 2. Upload code files in codeDir onto HDFS
        try {
            await HDFSClientUtility.copyDirectoryToHdfs(trialConfig.codeDir, hdfsCodeDir, this.hdfsClient);
        } catch (error) {
            this.log.error(`PAI Training service: copy ${trialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`);
            // The job never reached PAI, so it must not stay 'WAITING' in trialJobsMap
            trialJobDetail.status = 'FAILED';
            throw new Error(error.message);
        }

        // Step 3. Submit PAI job via Rest call
        // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
        const submitJobRequest: request.Options = {
            uri: `http://${clusterConfig.host}:9186/api/v1/jobs`,
            method: 'POST',
            json: true,
            body: paiJobConfig,
            headers: {
                "Content-Type": "application/json",
                "Authorization": 'Bearer ' + token
            }
        };
        request(submitJobRequest, (error: Error, response: request.Response, body: any) => {
            if (error || response.statusCode >= 400) {
                this.log.error(`PAI Training service: Submit trial ${trialJobId} to PAI Cluster failed!`);
                trialJobDetail.status = 'FAILED';
                // Always reject with an Error so callers get a message and a stack
                deferred.reject(new Error(
                    error ? error.message : 'Submit trial failed, http code: ' + response.statusCode));
            } else {
                deferred.resolve(trialJobDetail);
            }
        });

        return deferred.promise;
    }

    public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail> {
        throw new MethodNotImplementedError();
    }

    public get isMultiPhaseJobSupported(): boolean {
        return false;
    }

    /**
     * Not implemented yet: only logs the request and resolves.
     */
    public cancelTrialJob(trialJobId: string): Promise<void> {
        this.log.info(`PAI Training service cancelTrialJob: jobId: ${trialJobId}`);
        const deferred : Deferred<void> = new Deferred<void>();

        deferred.resolve();
        return deferred.promise;
    }

    /**
     * Store cluster or trial configuration.
     *
     * For PAI_CLUSTER_CONFIG the promise resolves only after a token was obtained from
     * the PAI REST server. Every failure (bad JSON, HTTP error, missing token, unknown key)
     * is reported through the returned promise, never as a synchronous throw.
     *
     * @param key one of TrialConfigMetadataKey.PAI_CLUSTER_CONFIG / TRIAL_CONFIG
     * @param value JSON encoded configuration
     */
    public setClusterMetadata(key: string, value: string): Promise<void> {
        const deferred : Deferred<void> = new Deferred<void>();

        switch (key) {
            case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: {
                let clusterConfig: PAIClusterConfig;
                try {
                    clusterConfig = <PAIClusterConfig>JSON.parse(value);
                    this.paiClusterConfig = clusterConfig;

                    this.hdfsClient = WebHDFS.createClient({
                        user: clusterConfig.userName,
                        port: 50070,
                        host: clusterConfig.host
                    });
                } catch (error) {
                    this.log.error(`Set PAI cluster config failed: ${error.message}`);
                    deferred.reject(new Error(`Set PAI cluster config failed: ${error.message}`));
                    break;
                }

                // Get PAI authentication token
                const authenticationReq: request.Options = {
                    uri: `http://${clusterConfig.host}:9186/api/v1/token`,
                    method: 'POST',
                    json: true,
                    body: {
                        username: clusterConfig.userName,
                        password: clusterConfig.passWord
                    }
                };

                request(authenticationReq, (error: Error, response: request.Response, body: any) => {
                    if (error) {
                        this.log.error(`Get PAI token failed: ${error.message}`);
                        deferred.reject(new Error(`Get PAI token failed: ${error.message}`));

                        return;
                    }
                    if (response.statusCode !== 200) {
                        this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`);
                        deferred.reject(new Error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`));

                        // Do not fall through: the body of an error response carries no usable token
                        return;
                    }
                    if (!body || !body.token) {
                        this.log.error('Get PAI token failed: response does not contain a token');
                        deferred.reject(new Error('Get PAI token failed: response does not contain a token'));

                        return;
                    }
                    this.paiToken = body.token;

                    // The token is a credential: record success only, never the value itself
                    this.log.info('Got authentication token from PAI Cluster');
                    deferred.resolve();
                });
                break;
            }
            case TrialConfigMetadataKey.TRIAL_CONFIG:
                if (!this.paiClusterConfig){
                    this.log.error('pai cluster config is not initialized');
                    deferred.reject(new Error('pai cluster config is not initialized'));
                    break;
                }
                try {
                    this.paiTrialConfig = <NNIPAITrialConfig>JSON.parse(value);
                } catch (error) {
                    this.log.error(`Set trial config failed: ${error.message}`);
                    deferred.reject(new Error(`Set trial config failed: ${error.message}`));
                    break;
                }
                this.log.info(`Set Cluster metadata: paiTrialConfig is ${JSON.stringify(this.paiTrialConfig)}`);
                deferred.resolve();
                break;
            default:
                //Reject for unknown keys
                deferred.reject(new Error(`Uknown key: ${key}`));
        }

        return deferred.promise;
    }

    /**
     * Not implemented yet: resolves without a value.
     */
    public getClusterMetadata(key: string): Promise<string> {
        const deferred : Deferred<string> = new Deferred<string>();

        deferred.resolve();
        return deferred.promise;
    }

    /**
     * Not implemented yet: resolves immediately.
     */
    public cleanUp(): Promise<void> {
        const deferred : Deferred<void> = new Deferred<void>();

        deferred.resolve();
        return deferred.promise;
    }


    public get MetricsEmitter() : EventEmitter {
        return this.metricsEmitter;
    }
}

export { PAITrainingService }
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import {TrialConfig} from '../common/trialConfig' + +export class PAITrialConfig extends TrialConfig{ + public readonly cpuNum: number; + public readonly memoryMB: number; + public readonly image: string; + public readonly dataDir: string; + public readonly outputDir: string; + + constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) { + super(command, codeDir, gpuNum); + this.cpuNum = cpuNum; + this.memoryMB = memoryMB; + this.image = image; + this.dataDir = dataDir; + this.outputDir = outputDir; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/remote_machine/metricsCollector.ts b/src/nni_manager/training_service/remote_machine/metricsCollector.ts index 3e346e7000..eb59a51d99 100644 --- a/src/nni_manager/training_service/remote_machine/metricsCollector.ts +++ b/src/nni_manager/training_service/remote_machine/metricsCollector.ts @@ -25,7 +25,8 @@ import * as path from 'path'; import { Client } from 'ssh2'; import { getLogger, Logger } from '../../common/log'; import { TrialJobStatus, TrialJobDetail } from '../../common/trainingService'; -import { JobMetrics, RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData'; +import { JobMetrics } from '../common/jobMetrics'; +import { RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData'; import { SSHClientUtility } from './sshClientUtility'; export class MetricsCollector { diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts index 1e52458790..0cd3a028dc 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts @@ -65,21 +65,6 @@ export class RemoteCommandResult { } } -// 
tslint:disable-next-line:max-classes-per-file -export class JobMetrics { - public readonly jobId: string; - public readonly metrics: string[]; - public readonly jobStatus: TrialJobStatus; - public readonly endTimestamp: number; - - constructor(jobId : string, metrics : string[], jobStatus : TrialJobStatus, endTimestamp : number) { - this.jobId = jobId; - this.metrics = metrics; - this.jobStatus = jobStatus; - this.endTimestamp = endTimestamp; - } -} - /** * RemoteMachineTrialJobDetail */ @@ -121,7 +106,7 @@ export enum ScheduleResultType { REQUIRE_EXCEED_TOTAL } -export const REMOTEMACHINERUNSHELLFORMAT: string = +export const REMOTEMACHINE_RUN_SHELL_FORMAT: string = `#!/bin/bash export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} cd $NNI_SYS_DIR @@ -129,7 +114,7 @@ echo $$ >{2} eval {3}{4} 2>{5} echo $? \`date +%s%3N\` >{6}`; -export const HOSTJOBSHELLFORMAT: string = +export const HOST_JOB_SHELL_FORMAT: string = `#!/bin/bash cd {0} echo $$ >{1} diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 772b93ff5d..e1cff16f22 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -43,8 +43,8 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { GPUScheduler } from './gpuScheduler'; import { MetricsCollector } from './metricsCollector'; import { - HOSTJOBSHELLFORMAT, RemoteCommandResult, RemoteMachineMeta, - REMOTEMACHINERUNSHELLFORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, + HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta, + REMOTEMACHINE_RUN_SHELL_FORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType } from './remoteMachineData'; import { SSHClientUtility } from 
'./sshClientUtility'; @@ -427,7 +427,7 @@ class RemoteMachineTrainingService implements TrainingService { // RemoteMachineRunShellFormat is the run shell format string, // See definition in remoteMachineData.ts const runScriptContent: string = String.Format( - REMOTEMACHINERUNSHELLFORMAT, + REMOTEMACHINE_RUN_SHELL_FORMAT, trialWorkingFolder, trialJobId, path.join(trialWorkingFolder, '.nni', 'jobpid'), @@ -470,7 +470,7 @@ class RemoteMachineTrainingService implements TrainingService { await cpp.exec(`mkdir -p ${localDir}`); await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteDir}`, sshClient); const runScriptContent: string = String.Format( - HOSTJOBSHELLFORMAT, remoteDir, path.join(remoteDir, 'jobpid'), form.cmd, path.join(remoteDir, 'code') + HOST_JOB_SHELL_FORMAT, remoteDir, path.join(remoteDir, 'jobpid'), form.cmd, path.join(remoteDir, 'code') ); await fs.promises.writeFile(path.join(localDir, 'run.sh'), runScriptContent, { encoding: 'utf8' }); await SSHClientUtility.copyFileToRemote( diff --git a/src/nni_manager/training_service/test/hdfsClientUtility.test.ts b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts new file mode 100644 index 0000000000..b8cf30e83a --- /dev/null +++ b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts @@ -0,0 +1,143 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
'use strict';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import * as tmp from 'tmp';
import { cleanupUnitTest, prepareUnitTest, uniqueString } from '../../common/utils';
import { HDFSClientUtility } from '../pai/hdfsClientUtility';

var WebHDFS = require('webhdfs');
var rmdir = require('rmdir');

/**
 * Integration tests for HDFSClientUtility against a real WebHDFS endpoint.
 * All cases are skipped (return early) when no endpoint is configured.
 */
describe('WebHDFS', function () {
    /*
    To enable web HDFS client unit test, HDFS information needs to be configured in:
    Default/.vscode/hdfsInfo.json,  whose content looks like:
    {
        "user": "user1",
        "port": 50070,
        "host": "10.0.0.0"
    }
    */
    let skip: boolean = false;
    let testHDFSInfo: any;
    let hdfsClient: any;
    try {
        testHDFSInfo = JSON.parse(fs.readFileSync('../../.vscode/hdfsInfo.json', 'utf8'));
        console.log(testHDFSInfo);
        hdfsClient = WebHDFS.createClient({
            user: testHDFSInfo.user,
            port: testHDFSInfo.port,
            host: testHDFSInfo.host
        });
    } catch (err) {
        // Name the file this suite actually reads, so the hint is actionable
        console.log('Please configure hdfsInfo.json to enable web HDFS client unit test.');
        skip = true;
    }

    before(() => {
        chai.should();
        chai.use(chaiAsPromised);
        tmp.setGracefulCleanup();
        prepareUnitTest();
    });

    after(() => {
        cleanupUnitTest();
    });

    it('Test HDFS utility path functions', async () => {
        if (skip) {
            return;
        }
        const testPath : string = '/nni_unittest_' + uniqueString(6);
        let exists : boolean = await HDFSClientUtility.pathExists(testPath, hdfsClient);
        // The new random named path is expected to not exist
        chai.expect(exists).to.be.equals(false);

        const mkdirResult : boolean = await HDFSClientUtility.mkdir(testPath, hdfsClient);
        // Mkdir is expected to be successful
        chai.expect(mkdirResult).to.be.equals(true);

        exists = await HDFSClientUtility.pathExists(testPath, hdfsClient);
        // The newly created path is expected to exist
        chai.expect(exists).to.be.equals(true);

        const deleteResult : boolean = await HDFSClientUtility.deletePath(testPath, hdfsClient);
        // Delete path is expected to be successful
        chai.expect(deleteResult).to.be.equals(true);

        exists = await HDFSClientUtility.pathExists(testPath, hdfsClient);
        // The deleted path is not expected to exist
        chai.expect(exists).to.be.equals(false);
    });

    it('Test HDFS utility copyFileToHdfs', async() => {
        if (skip) {
            return;
        }
        // Prepare local directory and files
        const tmpLocalDirectoryPath : string = path.join(os.tmpdir(), 'nni_unittest_dir_' + uniqueString(6));
        const tmpDataFilePath : string = path.join(tmpLocalDirectoryPath, 'file_' + uniqueString(6));
        const testFileData : string = 'TestContent123';
        fs.mkdirSync(tmpLocalDirectoryPath);
        fs.writeFileSync(tmpDataFilePath, testFileData);

        const testHDFSFilePath : string = '/nni_unittest_' + uniqueString(6);
        let exists : boolean = await HDFSClientUtility.pathExists(testHDFSFilePath, hdfsClient);
        // The new random named path is expected to not exist
        chai.expect(exists).to.be.equals(false);

        await HDFSClientUtility.copyFileToHdfs(tmpDataFilePath, testHDFSFilePath, hdfsClient);
        exists = await HDFSClientUtility.pathExists(testHDFSFilePath, hdfsClient);
        // After copy local file to HDFS, the target file path in HDFS is expected to exist
        chai.expect(exists).to.be.equals(true);

        const buffer : Buffer = await HDFSClientUtility.readFileFromHDFS(testHDFSFilePath, hdfsClient);
        const actualFileData : string = buffer.toString('utf8');
        // The file content read from HDFS is expected to equal to the content of local file
        chai.expect(actualFileData).to.be.equals(testFileData);

        const testHDFSDirPath : string = path.join('/nni_unittest_' + uniqueString(6) +  '_dir');

        await HDFSClientUtility.copyDirectoryToHdfs(tmpLocalDirectoryPath, testHDFSDirPath, hdfsClient);

        const files : any[] = await HDFSClientUtility.readdir(testHDFSDirPath, hdfsClient);

        // Expected file count under HDFS target directory is 1
        chai.expect(files.length).to.be.equals(1);

        // Expected file name under HDFS target directory is equal to local file name
        chai.expect(files[0].pathSuffix).to.be.equals(path.parse(tmpDataFilePath).base);

        // Cleanup
        rmdir(tmpLocalDirectoryPath);

        let deleteResult : boolean = await HDFSClientUtility.deletePath(testHDFSFilePath, hdfsClient);
        chai.expect(deleteResult).to.be.equals(true);

        deleteResult = await HDFSClientUtility.deletePath(testHDFSDirPath, hdfsClient);
        chai.expect(deleteResult).to.be.equals(true);
    });
});
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as chai from 'chai'; +import * as chaiAsPromised from 'chai-as-promised'; +import * as fs from 'fs'; +import * as tmp from 'tmp'; +import * as component from '../../common/component'; +import { cleanupUnitTest, prepareUnitTest } from '../../common/utils'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { PAITrainingService } from '../pai/paiTrainingService'; + +// TODO: copy mockedTrail.py to local folder +const localCodeDir: string = tmp.dirSync().name +const mockedTrialPath: string = './training_service/test/mockedTrial.py' +fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py') + +describe('Unit Test for PAITrainingService', () => { + let skip: boolean = false; + let testPaiClusterInfo: any; + let paiCluster: any; + let paiTrialConfig : any; + try { + testPaiClusterInfo = JSON.parse(fs.readFileSync('../../.vscode/paiCluster.json', 'utf8')); + paiCluster = `{\"userName\":\"${testPaiClusterInfo.userName}\",\"passWord\":\"${testPaiClusterInfo.passWord}\",\"host\":\"${testPaiClusterInfo.host}\"}`; + paiTrialConfig = `{\"command\":\"echo hello && ls\",\"codeDir\":\"/home/desy/nni/examples/trials/mnist",\"gpuNum\":\"1\", +\"cpuNum\":\"1\",\"memoryMB\":\"8196\",\"image\":\"openpai/pai.example.tensorflow\",\"dataDir\":\"\",\"outputDir\":\"\"}`; + } catch (err) { + console.log('Please configure rminfo.json to enable remote machine unit test.'); + skip = true; + } + + let paiTrainingService: PAITrainingService; + + console.log(tmp.dirSync().name); + + before(() => { + chai.should(); + chai.use(chaiAsPromised); + prepareUnitTest(); + }); + + after(() => { + cleanupUnitTest(); + }); + + beforeEach(() => { + if (skip) { + return; + } + paiTrainingService = component.get(PAITrainingService); + paiTrainingService.run(); + }); + + afterEach(() => { + if (skip) { + return; + } + paiTrainingService.cleanUp(); + }); + + it('Get PAI token', async () => { + if (skip) { + return; + } + 
console.log(`paiCluster is ${paiCluster}`) + await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.PAI_CLUSTER_CONFIG, paiCluster); + await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, paiTrialConfig); + try { + const trialDetail = await paiTrainingService.submitTrialJob({jobType : 'TRIAL'}); + chai.expect(trialDetail.status).to.be.equals('WAITING'); + } catch(error) { + console.log('Submit job failed:' + error); + chai.assert(error) + } + }); +}); \ No newline at end of file diff --git a/src/nni_manager/training_service_tool/setup.py b/src/nni_manager/training_service_tool/setup.py new file mode 100644 index 0000000000..a65a79263e --- /dev/null +++ b/src/nni_manager/training_service_tool/setup.py @@ -0,0 +1,20 @@ +import setuptools + +setuptools.setup( + # NNI Training Service(nnits) package + name = 'nnits-tool', + version = '0.0.1', + packages = setuptools.find_packages(), + + python_requires = '>=3.5', + install_requires = [ + 'requests', + 'psutil' + ], + + author = 'Microsoft NNI Team', + author_email = 'nni@microsoft.com', + description = 'NNI Training Service Tool for Neural Network Intelligence project', + license = 'MIT', + url = 'https://github.com/Microsoft/nni' +) \ No newline at end of file diff --git a/src/nni_manager/training_service_tool/trial/__init__.py b/src/nni_manager/training_service_tool/trial/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/nni_manager/training_service_tool/trial/constants.py b/src/nni_manager/training_service_tool/trial/constants.py new file mode 100644 index 0000000000..7f12835cd2 --- /dev/null +++ b/src/nni_manager/training_service_tool/trial/constants.py @@ -0,0 +1,37 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. 
"""Constants shared by the NNI trial keeper modules (URLs, ports and log locations)."""

import os

# Path prefix of the REST API that the trial keeper talks to
API_ROOT_URL = '/api/v1/nni-pai'

BASE_URL = 'http://localhost'

DEFAULT_REST_PORT = 51189

# expanduser('~') yields $HOME when it is set and still resolves a home directory
# when it is not, instead of raising KeyError while this module is imported.
HOME_DIR = os.path.join(os.path.expanduser('~'), 'nni')

LOG_DIR = os.path.join(HOME_DIR, 'trial-keeper', 'log')

# Files that receive the trial process' stdout / stderr
STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')

STDERR_FULL_PATH = os.path.join(LOG_DIR, 'stderr')

UPDATE_METRICS_API = '/update-metrics'
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import argparse
import errno
import json
import logging
import os
import re
import requests

from .constants import DEFAULT_REST_PORT
from .rest_utils import rest_get, rest_post, rest_put, rest_delete
from .url_utils import gen_update_metrics_url

NNI_SYS_DIR = os.environ['NNI_SYS_DIR']
NNI_TRIAL_JOB_ID = os.environ['NNI_TRIAL_JOB_ID']
# Each record in the metrics file is: MAGIC, a LEN_FIELD_SIZE-character length, then the payload
LEN_FIELD_SIZE = 6
MAGIC = 'ME'

logger = logging.getLogger(__name__)

print('In metrics_reader, NNI_SYS_DIR is {}'.format(NNI_SYS_DIR))

class TrialMetricsReader():
    '''
    Read metrics data from a trial job.

    Metrics are read incrementally from NNI_SYS_DIR/.nni/metrics; the position reached
    so far is persisted in NNI_SYS_DIR/.nni/metrics_offset.
    '''
    def __init__(self, rest_port=DEFAULT_REST_PORT):
        self.offset_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics_offset')
        self.metrics_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics')
        self.rest_port = rest_port

    def _metrics_file_is_empty(self):
        '''Return True when the metrics file is missing or has no content.'''
        if not os.path.isfile(self.metrics_filename):
            return True
        statinfo = os.stat(self.metrics_filename)
        return statinfo.st_size == 0

    def _get_offset(self):
        '''Return the persisted read offset, or 0 when no offset file exists yet.'''
        offset = 0
        if os.path.isfile(self.offset_filename):
            with open(self.offset_filename, 'r') as f:
                offset = int(f.readline())
        return offset

    def _write_offset(self, offset):
        '''
        Persist the read offset.

        Raises ValueError when the offset is negative or beyond the metrics file size.
        '''
        statinfo = os.stat(self.metrics_filename)
        if offset < 0 or offset > statinfo.st_size:
            raise ValueError('offset value is invalid: {}'.format(offset))

        with open(self.offset_filename, 'w') as f:
            f.write(str(offset)+'\n')

    def _read_all_available_records(self, offset):
        '''
        Read every complete record after offset, persist the new offset and return the payloads.

        Raises ValueError when a record is truncated or malformed; in that case the
        persisted offset is left untouched.
        '''
        new_offset = offset
        metrics = []
        with open(self.metrics_filename, 'r') as f:
            print('offset is {}'.format(offset))
            f.seek(offset)
            while True:
                magic_string = f.read(len(MAGIC))
                # empty data means EOF
                if not magic_string:
                    break
                strdatalen = f.read(LEN_FIELD_SIZE)
                # empty data means EOF
                if not strdatalen:
                    raise ValueError("metric file {} format error after offset: {}.".format(self.metrics_filename, new_offset))
                datalen = int(strdatalen)
                data = f.read(datalen)

                if datalen > 0 and len(data) == datalen:
                    print('data is \'{}\''.format(data))
                    new_offset = f.tell()
                    metrics.append(data)
                else:
                    raise ValueError("metric file {} format error after offset: {}.".format(self.metrics_filename, new_offset))
        self._write_offset(new_offset)
        return metrics

    def read_trial_metrics(self):
        '''
        Read available metrics data for a trial
        '''
        if self._metrics_file_is_empty():
            print('metrics is empty')
            return []

        offset = self._get_offset()
        return self._read_all_available_records(offset)

def read_experiment_metrics():
    '''
    Read the new metrics of this trial and post them to the NNI manager rest server.

    Best effort: failures are logged, never raised. When the post gets no response at
    all, the read offset is rolled back so the same records are sent again next time.
    Returns the JSON text of what was collected ('{}' if nothing could be read).
    '''
    result = {}
    reader = None
    previous_offset = None
    delivered = False
    try:
        reader = TrialMetricsReader()
        previous_offset = reader._get_offset()
        result['jobId'] = NNI_TRIAL_JOB_ID
        result['metrics'] = reader.read_trial_metrics()
        print('Result metrics is {}'.format(json.dumps(result)))
        response = rest_post(gen_update_metrics_url(DEFAULT_REST_PORT, NNI_TRIAL_JOB_ID), json.dumps(result), 10)

        # rest_post reports a transport failure by returning None
        if response is None:
            logger.error('No response from rest server when updating metrics of trial %s', NNI_TRIAL_JOB_ID)
        else:
            delivered = True
            print('Response code is {}'.format(response.status_code))
    except Exception:
        logger.exception('Failed to read or report metrics of trial %s', NNI_TRIAL_JOB_ID)

    if not delivered and previous_offset is not None and result.get('metrics'):
        # The offset was already advanced by the read; undo that so nothing is lost
        try:
            reader._write_offset(previous_offset)
        except (OSError, ValueError):
            logger.exception('Failed to roll back metrics offset of trial %s', NNI_TRIAL_JOB_ID)

    return json.dumps(result)
+# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import logging
import time
import requests

logger = logging.getLogger(__name__)

_JSON_HEADERS = {'Accept': 'application/json', 'Content-Type': 'application/json'}

def _send(sender, url, **kwargs):
    '''Call one of the requests verb functions; log any failure and return None instead of raising.'''
    try:
        return sender(url, **kwargs)
    except Exception:
        # Callers treat None as "request failed"; keep that contract but leave a trace
        logger.warning('REST call %s %s failed', getattr(sender, '__name__', sender), url, exc_info=True)
        return None

def rest_get(url, timeout):
    '''Call rest get method; return the response, or None if the request failed'''
    return _send(requests.get, url, timeout=timeout)

def rest_post(url, data, timeout):
    '''Call rest post method with a JSON body; return the response, or None if the request failed'''
    return _send(requests.post, url, headers=dict(_JSON_HEADERS), data=data, timeout=timeout)

def rest_put(url, data, timeout):
    '''Call rest put method with a JSON body; return the response, or None if the request failed'''
    return _send(requests.put, url, headers=dict(_JSON_HEADERS), data=data, timeout=timeout)

def rest_delete(url, timeout):
    '''Call rest delete method; return the response, or None if the request failed'''
    return _send(requests.delete, url, timeout=timeout)
+# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# ============================================================================================================================== # + +import argparse +import sys +import os +from subprocess import Popen, PIPE +import time +import logging +import shlex + +from .constants import HOME_DIR, LOG_DIR, STDOUT_FULL_PATH, STDERR_FULL_PATH +from .metrics_reader import read_experiment_metrics + +logger = logging.getLogger('trial_keeper') + +def main_loop(args): + '''main loop logic for trial keeper''' + + if not os.path.exists(LOG_DIR): + os.makedirs(LOG_DIR) + + stdout_file = open(STDOUT_FULL_PATH, 'a+') + stderr_file = open(STDERR_FULL_PATH, 'a+') + print(shlex.split(args.trial_command)) + # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior + process = Popen(args.trial_command, shell = True, stdout = stdout_file, stderr = stderr_file) + print('Subprocess pid is {}'.format(process.pid)) + print('Current cwd is {}'.format(os.getcwd())) + while True: + retCode = process.poll() + if retCode is not None: + print('subprocess terminated. Exit code is {}. 
Quit'.format(retCode)) + break + else: + print('subprocess pid: {} is still alive'.format(process.pid)) + read_experiment_metrics() + time.sleep(2) + +def trial_keeper_help_info(*args): + print('please run --help to see guidance') + +if __name__ == '__main__': + '''NNI Trial Keeper main function''' + PARSER = argparse.ArgumentParser() + PARSER.set_defaults(func=trial_keeper_help_info) + PARSER.add_argument("--trial_command", type=str, help="Command to launch trial process") + args, unknown = PARSER.parse_known_args() + if args.trial_command is None: + exit(1) + + try: + main_loop(args) + except: + print('Exiting by user request') + sys.exit(1) + diff --git a/src/nni_manager/training_service_tool/trial/url_utils.py b/src/nni_manager/training_service_tool/trial/url_utils.py new file mode 100644 index 0000000000..d3d9b61727 --- /dev/null +++ b/src/nni_manager/training_service_tool/trial/url_utils.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +from .constants import API_ROOT_URL, BASE_URL, UPDATE_METRICS_API + +def gen_update_metrics_url(port, trial_job_id): + '''Generate update trial metrics url''' + return '{0}:{1}{2}{3}/:{4}'.format(BASE_URL, port, API_ROOT_URL, UPDATE_METRICS_API, trial_job_id) \ No newline at end of file diff --git a/src/nni_manager/yarn.lock b/src/nni_manager/yarn.lock index 8611053414..b8ca788520 100644 --- a/src/nni_manager/yarn.lock +++ b/src/nni_manager/yarn.lock @@ -224,7 +224,7 @@ accepts@~1.3.5: mime-types "~2.1.18" negotiator "0.6.1" -ajv@^5.1.0: +ajv@^5.1.0, ajv@^5.3.0: version "5.5.2" resolved "https://registry.yarnpkg.com/ajv/-/ajv-5.5.2.tgz#73b5eeca3fab653e3d3f9422b341ad42205dc965" dependencies: @@ -310,6 +310,10 @@ aws4@^1.6.0: version "1.7.0" resolved "https://registry.yarnpkg.com/aws4/-/aws4-1.7.0.tgz#d4d0e9b9dbfca77bf08eeb0a8a471550fe39e289" +aws4@^1.8.0: + version "1.8.0" + resolved "https://registry.yarnpkg.com/aws4/-/aws4-1.8.0.tgz#f0e003d9ca9e7f59c7a508945d7b2ef9a04a542f" + babel-code-frame@^6.22.0: version "6.26.0" resolved "https://registry.yarnpkg.com/babel-code-frame/-/babel-code-frame-6.26.0.tgz#63fd43f7dc1e3bb7ce35947db8fe369a3f58c74b" @@ -364,6 +368,10 @@ buffer-from@^1.0.0, buffer-from@^1.1.0: version "1.1.1" resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.1.tgz#32713bc028f75c02fdb710d7c7bcec1f2c6070ef" +buffer-stream-reader@^0.1.1: + version "0.1.1" + resolved "https://registry.yarnpkg.com/buffer-stream-reader/-/buffer-stream-reader-0.1.1.tgz#ca8bf93631deedd8b8f8c3bb44991cc30951e259" + builtin-modules@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/builtin-modules/-/builtin-modules-1.1.1.tgz#270f076c5a72c02f5b65a47df94c5fe3a278892f" @@ -455,6 +463,12 @@ 
combined-stream@1.0.6, combined-stream@~1.0.5: dependencies: delayed-stream "~1.0.0" +combined-stream@~1.0.6: + version "1.0.7" + resolved "https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.7.tgz#2d1d24317afb8abe95d6d2c0b07b57813539d828" + dependencies: + delayed-stream "~1.0.0" + commander@2.15.1: version "2.15.1" resolved "https://registry.yarnpkg.com/commander/-/commander-2.15.1.tgz#df46e867d0fc2aec66a34662b406a9ccafff5b0f" @@ -635,7 +649,7 @@ extend@2.0.x: version "2.0.2" resolved "https://registry.yarnpkg.com/extend/-/extend-2.0.2.tgz#1b74985400171b85554894459c978de6ef453ab7" -extend@~3.0.1: +extend@^3.0.0, extend@~3.0.1, extend@~3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/extend/-/extend-3.0.2.tgz#f8b1136b4071fbd8eb140aff858b1019ec2915fa" @@ -671,7 +685,7 @@ forever-agent@~0.6.1: version "0.6.1" resolved "https://registry.yarnpkg.com/forever-agent/-/forever-agent-0.6.1.tgz#fbc71f0c41adeb37f96c577ad1ed42d8fdacca91" -form-data@~2.3.1: +form-data@~2.3.1, form-data@~2.3.2: version "2.3.2" resolved "https://registry.yarnpkg.com/form-data/-/form-data-2.3.2.tgz#4970498be604c20c005d4f5c23aecd21d6b49099" dependencies: @@ -763,6 +777,13 @@ har-validator@~5.0.3: ajv "^5.1.0" har-schema "^2.0.0" +har-validator@~5.1.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/har-validator/-/har-validator-5.1.0.tgz#44657f5688a22cfd4b72486e81b3a3fb11742c29" + dependencies: + ajv "^5.3.0" + har-schema "^2.0.0" + has-ansi@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/has-ansi/-/has-ansi-2.0.0.tgz#34f5049ce1ecdf2b0649af3ef24e45ed35416d91" @@ -870,6 +891,10 @@ is-typedarray@~1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/is-typedarray/-/is-typedarray-1.0.0.tgz#e479c80858df0c1b11ddda6940f96011fcda4a9a" +is@~0.2.6: + version "0.2.7" + resolved "http://registry.npmjs.org/is/-/is-0.2.7.tgz#3b34a2c48f359972f35042849193ae7264b63562" + isarray@~1.0.0: version "1.0.0" resolved 
"https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" @@ -958,12 +983,22 @@ mime-db@~1.35.0: version "1.35.0" resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.35.0.tgz#0569d657466491283709663ad379a99b90d9ab47" +mime-db@~1.36.0: + version "1.36.0" + resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.36.0.tgz#5020478db3c7fe93aad7bbcc4dcf869c43363397" + mime-types@^2.1.12, mime-types@~2.1.17, mime-types@~2.1.18: version "2.1.19" resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.19.tgz#71e464537a7ef81c15f2db9d97e913fc0ff606f0" dependencies: mime-db "~1.35.0" +mime-types@~2.1.19: + version "2.1.20" + resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.20.tgz#930cb719d571e903738520f8470911548ca2cc19" + dependencies: + mime-db "~1.36.0" + mime@1.4.1: version "1.4.1" resolved "https://registry.yarnpkg.com/mime/-/mime-1.4.1.tgz#121f9ebc49e3766f311a76e1fa1c8003c4b03aa6" @@ -1066,6 +1101,19 @@ node-version@^1.0.0: version "1.2.0" resolved "https://registry.yarnpkg.com/node-version/-/node-version-1.2.0.tgz#34fde3ffa8e1149bd323983479dda620e1b5060d" +node.extend@1.0.8: + version "1.0.8" + resolved "https://registry.yarnpkg.com/node.extend/-/node.extend-1.0.8.tgz#bab04379f7383f4587990c9df07b6a7f65db772b" + dependencies: + is "~0.2.6" + object-keys "~0.4.0" + +node.flow@1.2.3: + version "1.2.3" + resolved "https://registry.yarnpkg.com/node.flow/-/node.flow-1.2.3.tgz#e1c44a82aeca8d78b458a77fb3dc642f2eba2649" + dependencies: + node.extend "1.0.8" + nopt@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/nopt/-/nopt-4.0.1.tgz#d0d4685afd5415193c8c7505602d0d17cd64474d" @@ -1101,10 +1149,18 @@ oauth-sign@~0.8.2: version "0.8.2" resolved "https://registry.yarnpkg.com/oauth-sign/-/oauth-sign-0.8.2.tgz#46a6ab7f0aead8deae9ec0565780b7d4efeb9d43" +oauth-sign@~0.9.0: + version "0.9.0" + resolved 
"https://registry.yarnpkg.com/oauth-sign/-/oauth-sign-0.9.0.tgz#47a7b016baa68b5fa0ecf3dee08a85c679ac6455" + object-assign@^4.0.1, object-assign@^4.1.0: version "4.1.1" resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863" +object-keys@~0.4.0: + version "0.4.0" + resolved "https://registry.yarnpkg.com/object-keys/-/object-keys-0.4.0.tgz#28a6aae7428dd2c3a92f3d95f21335dd204e0336" + on-finished@~2.3.0: version "2.3.0" resolved "https://registry.yarnpkg.com/on-finished/-/on-finished-2.3.0.tgz#20f1336481b083cd75337992a16971aa2d906947" @@ -1199,6 +1255,10 @@ pseudomap@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/pseudomap/-/pseudomap-1.0.2.tgz#f052a28da70e618917ef0a8ac34c1ae5a68286b3" +psl@^1.1.24: + version "1.1.29" + resolved "https://registry.yarnpkg.com/psl/-/psl-1.1.29.tgz#60f580d360170bb722a797cc704411e6da850c67" + punycode@^1.4.1: version "1.4.1" resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e" @@ -1207,7 +1267,7 @@ qs@6.5.1: version "6.5.1" resolved "https://registry.yarnpkg.com/qs/-/qs-6.5.1.tgz#349cdf6eef89ec45c12d7d5eb3fc0c870343a6d8" -qs@~6.5.1: +qs@~6.5.1, qs@~6.5.2: version "6.5.2" resolved "https://registry.yarnpkg.com/qs/-/qs-6.5.2.tgz#cb3ae806e8740444584ef154ce8ee98d403f3e36" @@ -1249,6 +1309,31 @@ reflect-metadata@^0.1.10: version "0.1.12" resolved "https://registry.yarnpkg.com/reflect-metadata/-/reflect-metadata-0.1.12.tgz#311bf0c6b63cd782f228a81abe146a2bfa9c56f2" +request@^2.74.0: + version "2.88.0" + resolved "https://registry.yarnpkg.com/request/-/request-2.88.0.tgz#9c2fca4f7d35b592efe57c7f0a55e81052124fef" + dependencies: + aws-sign2 "~0.7.0" + aws4 "^1.8.0" + caseless "~0.12.0" + combined-stream "~1.0.6" + extend "~3.0.2" + forever-agent "~0.6.1" + form-data "~2.3.2" + har-validator "~5.1.0" + http-signature "~1.2.0" + is-typedarray "~1.0.0" + isstream "~0.1.2" + json-stringify-safe "~5.0.1" + mime-types 
"~2.1.19" + oauth-sign "~0.9.0" + performance-now "^2.1.0" + qs "~6.5.2" + safe-buffer "^5.1.2" + tough-cookie "~2.4.3" + tunnel-agent "^0.6.0" + uuid "^3.3.2" + request@^2.87.0: version "2.87.0" resolved "https://registry.yarnpkg.com/request/-/request-2.87.0.tgz#32f00235cd08d482b4d0d68db93a829c0ed5756e" @@ -1294,6 +1379,12 @@ rimraf@^2.6.1: dependencies: glob "^7.0.5" +rmdir@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/rmdir/-/rmdir-1.2.0.tgz#4fe0357cb06168c258e73e968093dc4e8a0f3253" + dependencies: + node.flow "1.2.3" + rx@^4.1.0: version "4.1.0" resolved "https://registry.yarnpkg.com/rx/-/rx-4.1.0.tgz#a5f13ff79ef3b740fe30aa803fb09f98805d4782" @@ -1510,6 +1601,13 @@ tough-cookie@~2.3.3: dependencies: punycode "^1.4.1" +tough-cookie@~2.4.3: + version "2.4.3" + resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-2.4.3.tgz#53f36da3f47783b0925afa06ff9f3b165280f781" + dependencies: + psl "^1.1.24" + punycode "^1.4.1" + tree-kill@^1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/tree-kill/-/tree-kill-1.2.0.tgz#5846786237b4239014f05db156b643212d4c6f36" @@ -1612,7 +1710,7 @@ utils-merge@1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713" -uuid@^3.1.0: +uuid@^3.1.0, uuid@^3.3.2: version "3.3.2" resolved "https://registry.yarnpkg.com/uuid/-/uuid-3.3.2.tgz#1b4af4955eb3077c501c23872fc6513811587131" @@ -1628,6 +1726,14 @@ verror@1.10.0: core-util-is "1.0.2" extsprintf "^1.2.0" +webhdfs@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/webhdfs/-/webhdfs-1.2.0.tgz#c41b08ae33944a0220863bfd4b6719b9aaec1d37" + dependencies: + buffer-stream-reader "^0.1.1" + extend "^3.0.0" + request "^2.74.0" + which@^1.2.9: version "1.3.1" resolved "https://registry.yarnpkg.com/which/-/which-1.3.1.tgz#a45043d54f5805316da8d62f9f50918d3da70b0a" From 89830459b57ceb32bcf6dcb7c9a702b5e61a729c Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Wed, 19 
Sep 2018 18:37:02 +0800 Subject: [PATCH 02/60] update trial package directory in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9545ead48c..28b70170bc 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def run(self): 'nni_annotation': 'tools/nni_annotation', 'nni': 'src/sdk/pynni/nni', 'nnicmd': 'tools/nnicmd', - 'trialkeeper': 'src/nni_manager/training_service_tool/trialkeeper' + 'trialkeeper': 'src/nni_manager/training_service_tool/trial' }, python_requires = '>=3.5', install_requires = [ From 248d0eb0068bd4a1dd84053114a610baafd9f65a Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Wed, 19 Sep 2018 18:48:16 +0800 Subject: [PATCH 03/60] Update setup.py package info --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 28b70170bc..6e0782ccdf 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def run(self): 'nni_annotation': 'tools/nni_annotation', 'nni': 'src/sdk/pynni/nni', 'nnicmd': 'tools/nnicmd', - 'trialkeeper': 'src/nni_manager/training_service_tool/trial' + 'trial': 'src/nni_manager/training_service_tool/trial' }, python_requires = '>=3.5', install_requires = [ From 43fca7651dfa1eb5a713b28efcfc6ee381d7ffd3 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 20 Sep 2018 17:34:52 +0800 Subject: [PATCH 04/60] Update trial keeper module, use IP adress for pai training service machine --- src/nni_manager/common/utils.ts | 18 ++++++++++++++++-- .../training_service_tool/trial/constants.py | 2 +- .../trial/metrics_reader.py | 6 +++--- .../trial/trial_keeper.py | 5 +++-- .../training_service_tool/trial/url_utils.py | 6 +++--- 5 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index ba0650ef28..805d3ac4b0 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -229,5 +229,19 @@ function cleanupUnitTest(): void { 
Container.restore(ExperimentStartupInfo); } -export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, mkDirP, delay, prepareUnitTest, - parseArg, cleanupUnitTest, uniqueString, randomSelect }; +/** + * Get IPv4 address of current machine + */ +function getIPV4Address(): string { + let ipv4Address : string = ''; + + for(const item of os.networkInterfaces().eth0) { + if(item.family === 'IPv4') { + ipv4Address = item.address; + } + } + return ipv4Address; +} + +export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, getIPV4Address, + mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect }; diff --git a/src/nni_manager/training_service_tool/trial/constants.py b/src/nni_manager/training_service_tool/trial/constants.py index 7f12835cd2..7ff3d7847f 100644 --- a/src/nni_manager/training_service_tool/trial/constants.py +++ b/src/nni_manager/training_service_tool/trial/constants.py @@ -22,7 +22,7 @@ API_ROOT_URL = '/api/v1/nni-pai' -BASE_URL = 'http://localhost' +BASE_URL = 'http://{}' DEFAULT_REST_PORT = 51189 diff --git a/src/nni_manager/training_service_tool/trial/metrics_reader.py b/src/nni_manager/training_service_tool/trial/metrics_reader.py index b3faff9e74..6d18c3b92c 100644 --- a/src/nni_manager/training_service_tool/trial/metrics_reader.py +++ b/src/nni_manager/training_service_tool/trial/metrics_reader.py @@ -24,7 +24,7 @@ import re import requests -from .constants import DEFAULT_REST_PORT +from .constants import BASE_URL, DEFAULT_REST_PORT from .rest_utils import rest_get, rest_post, rest_put, rest_delete from .url_utils import gen_update_metrics_url @@ -103,7 +103,7 @@ def read_trial_metrics(self): offset = self._get_offset() return self._read_all_available_records(offset) -def read_experiment_metrics(): +def read_experiment_metrics(nnimanager_ip): ''' Read metrics data for specified trial jobs ''' @@ -113,7 +113,7 @@ def read_experiment_metrics(): result['jobId'] 
= NNI_TRIAL_JOB_ID result['metrics'] = reader.read_trial_metrics() print('Result metrics is {}'.format(json.dumps(result))) - response = rest_post(gen_update_metrics_url(DEFAULT_REST_PORT, NNI_TRIAL_JOB_ID), json.dumps(result), 10) + response = rest_post(gen_update_metrics_url(BASE_URL.format(nnimanager_ip), DEFAULT_REST_PORT, NNI_TRIAL_JOB_ID), json.dumps(result), 10) print('Response code is {}'.format(response.status_code)) except Exception: diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 4623562200..07084691bc 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -50,7 +50,7 @@ def main_loop(args): break else: print('subprocess pid: {} is still alive'.format(process.pid)) - read_experiment_metrics() + read_experiment_metrics(args.nnimanager_ip) time.sleep(2) def trial_keeper_help_info(*args): @@ -60,7 +60,8 @@ def trial_keeper_help_info(*args): '''NNI Trial Keeper main function''' PARSER = argparse.ArgumentParser() PARSER.set_defaults(func=trial_keeper_help_info) - PARSER.add_argument("--trial_command", type=str, help="Command to launch trial process") + PARSER.add_argument('--trial_command', type=str, help='Command to launch trial process') + PARSER.add_argument('--nnimanager_ip', type=str, default='localhost', help='NNI manager IP') args, unknown = PARSER.parse_known_args() if args.trial_command is None: exit(1) diff --git a/src/nni_manager/training_service_tool/trial/url_utils.py b/src/nni_manager/training_service_tool/trial/url_utils.py index d3d9b61727..de5b424f07 100644 --- a/src/nni_manager/training_service_tool/trial/url_utils.py +++ b/src/nni_manager/training_service_tool/trial/url_utils.py @@ -18,8 +18,8 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
IN THE SOFTWARE. -from .constants import API_ROOT_URL, BASE_URL, UPDATE_METRICS_API +from .constants import API_ROOT_URL, UPDATE_METRICS_API -def gen_update_metrics_url(port, trial_job_id): +def gen_update_metrics_url(base_url, port, trial_job_id): '''Generate update trial metrics url''' - return '{0}:{1}{2}{3}/:{4}'.format(BASE_URL, port, API_ROOT_URL, UPDATE_METRICS_API, trial_job_id) \ No newline at end of file + return '{0}:{1}{2}{3}/:{4}'.format(base_url, port, API_ROOT_URL, UPDATE_METRICS_API, trial_job_id) \ No newline at end of file From 4fe49dedb3cb114c8de28e5f6be29774672713c1 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 20 Sep 2018 18:34:49 +0800 Subject: [PATCH 05/60] Update metrics file path in reader --- src/nni_manager/training_service_tool/trial/metrics_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/metrics_reader.py b/src/nni_manager/training_service_tool/trial/metrics_reader.py index 6d18c3b92c..03a9cd8abc 100644 --- a/src/nni_manager/training_service_tool/trial/metrics_reader.py +++ b/src/nni_manager/training_service_tool/trial/metrics_reader.py @@ -40,8 +40,8 @@ class TrialMetricsReader(): Read metrics data from a trial job ''' def __init__(self, rest_port = DEFAULT_REST_PORT): - self.offset_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics_offset') - self.metrics_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics') + self.offset_filename = os.path.join('.nni', 'metrics_offset') + self.metrics_filename = os.path.join('.nni', 'metrics') self.rest_port = rest_port def _metrics_file_is_empty(self): From 66a54e1e41cb1d76fa120921aed8bc3f5cf7f0c5 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Fri, 21 Sep 2018 09:43:51 +0800 Subject: [PATCH 06/60] Fix metrics file path issue --- src/nni_manager/training_service_tool/trial/metrics_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/nni_manager/training_service_tool/trial/metrics_reader.py b/src/nni_manager/training_service_tool/trial/metrics_reader.py index 03a9cd8abc..6d18c3b92c 100644 --- a/src/nni_manager/training_service_tool/trial/metrics_reader.py +++ b/src/nni_manager/training_service_tool/trial/metrics_reader.py @@ -40,8 +40,8 @@ class TrialMetricsReader(): Read metrics data from a trial job ''' def __init__(self, rest_port = DEFAULT_REST_PORT): - self.offset_filename = os.path.join('.nni', 'metrics_offset') - self.metrics_filename = os.path.join('.nni', 'metrics') + self.offset_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics_offset') + self.metrics_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics') self.rest_port = rest_port def _metrics_file_is_empty(self): From 65709d368366ad33870fda60b2fb4772d4de7e11 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Fri, 21 Sep 2018 18:30:45 +0800 Subject: [PATCH 07/60] Update pai integration, full implementation of pai training service --- .../training_service/pai/paiData.ts | 12 +- .../pai/paiJobInfoCollector.ts | 136 ++++++++++++++++++ .../pai/paiTrainingService.ts | 88 +++++++++--- src/sdk/pynni/nni/platform/__init__.py | 2 +- 4 files changed, 212 insertions(+), 26 deletions(-) create mode 100644 src/nni_manager/training_service/pai/paiJobInfoCollector.ts diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 0270ef0e12..2c8bb6b868 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -24,6 +24,7 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from 'common/trai export class PAITrialJobDetail implements TrialJobDetail { public id: string; public status: TrialJobStatus; + public paiJobName: string; public submitTime: number; public startTime?: number; public endTime?: number; @@ -32,9 +33,11 @@ export class PAITrialJobDetail implements TrialJobDetail { public workingDirectory: string; public 
form: JobApplicationForm; - constructor(id: string, status: TrialJobStatus, submitTime: number, workingDirectory: string, form: JobApplicationForm) { + constructor(id: string, status: TrialJobStatus, paiJobName : string, + submitTime: number, workingDirectory: string, form: JobApplicationForm) { this.id = id; this.status = status; + this.paiJobName = paiJobName; this.submitTime = submitTime; this.workingDirectory = workingDirectory; this.form = form; @@ -43,6 +46,7 @@ export class PAITrialJobDetail implements TrialJobDetail { } export const PAI_TRIAL_COMMAND_FORMAT: string = -`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} -&& cd $NNI_SYS_DIR -&& python3 -m trial.trial_keeper --trial_command '{2}'`; \ No newline at end of file +`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai +&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} +&& cd $NNI_SYS_DIR && mkdir .nni +&& python3 -m trial.trial_keeper --trial_command '{2}' --nnimanager_ip '{3}'`; \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts new file mode 100644 index 0000000000..f347205b80 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -0,0 +1,136 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as request from 'request'; +import { EventEmitter } from 'events'; +import { Deferred } from 'ts-deferred'; +import { getLogger, Logger } from '../../common/log'; +import { NNIError, NNIErrorNames } from '../../common/errors'; +import { PAITrialJobDetail } from './paiData'; +import { PAIClusterConfig } from './paiConfig'; +import { TrialJobStatus } from '../../common/trainingService'; + +/** + * Collector PAI jobs info from PAI cluster, and update pai job status locally + */ +export class PAIJobInfoCollector { + private readonly trialJobsMap : Map; + private readonly log: Logger = getLogger(); + private readonly statusesNeedToCheck : TrialJobStatus[]; + private readonly finalStatuses : TrialJobStatus[]; + + constructor(jobMap: Map) { + this.trialJobsMap = jobMap; + this.statusesNeedToCheck = ['RUNNING', 'UNKNOWN', 'WAITING']; + this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED']; + } + + public async updateTrialStatusFromPAI(paiToken? 
: string, paiClusterConfig?: PAIClusterConfig) : Promise { + if (!paiClusterConfig || !paiToken) { + return Promise.resolve(); + } + + const updatePaiTrialJobs : Promise[] = []; + for(let [trialJobId, paiTrialJob] of this.trialJobsMap) { + if (!paiTrialJob) { + throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); + } + updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig)) + } + + await Promise.all(updatePaiTrialJobs); + } + + private getSinglePAITrialJobInfo(paiTrialJob : PAITrialJobDetail, paiToken : string, paiClusterConfig: PAIClusterConfig) : Promise { + const deferred : Deferred = new Deferred(); + if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) { + deferred.resolve(); + return deferred.promise; + } + + // Rest call to get PAI job info and update status + // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API + const getJobInfoRequest: request.Options = { + uri: `http://${paiClusterConfig.host}:9186/api/v1/jobs/${paiTrialJob.paiJobName}`, + method: 'GET', + json: true, + headers: { + "Content-Type": "application/json", + "Authorization": 'Bearer ' + paiToken + } + }; + //TODO : pass in request timeout param? 
+ request(getJobInfoRequest, (error: Error, response: request.Response, body: any) => { + if (error || response.statusCode >= 500) { + this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); + // Queried PAI job info failed, set job status to UNKNOWN + if(paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') { + paiTrialJob.status = 'UNKNOWN'; + } + } else { + if(response.body.jobStatus && response.body.jobStatus.state) { + console.log(`*****IN getSinglePAITrialJobInfo: response body state is ${response.body.jobStatus.state}`); + switch(response.body.jobStatus.state) { + case 'WAITING': + paiTrialJob.status = 'WAITING'; + break; + case 'RUNNING': + paiTrialJob.status = 'RUNNING'; + if(!paiTrialJob.startTime) { + paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + } + if(!paiTrialJob.url) { + paiTrialJob.url = response.body.jobStatus.appTrackingUrl; + } + break; + case 'SUCCEEDED': + paiTrialJob.status = 'SUCCEEDED'; + break; + case 'STOPPED': + paiTrialJob.status = 'USER_CANCELED'; + break; + case 'FAILED': + paiTrialJob.status = 'FAILED'; + break; + default: + paiTrialJob.status = 'UNKNOWN'; + break; + } + // For final job statues, update startTime, endTime and url + if(this.finalStatuses.includes(paiTrialJob.status)) { + if(!paiTrialJob.startTime) { + paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + } + if(!paiTrialJob.endTime) { + paiTrialJob.endTime = response.body.jobStatus.completedTime; + } + if(!paiTrialJob.url) { + paiTrialJob.url = response.body.jobStatus.appTrackingUrl; + } + } + } + } + deferred.resolve(); + }); + + return deferred.promise; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index a72900465e..1764f3828a 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ 
b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -20,8 +20,9 @@ 'use strict' -import * as assert from 'assert'; import * as component from '../../common/component'; +import * as cpp from 'child-process-promise'; +import * as fs from 'fs'; import * as path from 'path'; import * as request from 'request'; @@ -34,10 +35,11 @@ import { HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; -import { delay, getExperimentRootDir, uniqueString } from '../../common/utils'; +import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { ObservableTimer } from '../../common/observableTimer'; import { PAIJobRestServer } from './paiJobRestServer' import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiData'; +import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { String } from 'typescript-string-operations'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; import { HDFSClientUtility } from './hdfsClientUtility' @@ -62,34 +64,52 @@ class PAITrainingService implements TrainingService { private hdfsClient: any; private paiToken? : string; private experimentId! 
: string; + private readonly paiJobCollector : PAIJobInfoCollector; - constructor(@component.Inject timer: ObservableTimer) { + constructor() { this.log = getLogger(); this.metricsEmitter = new EventEmitter(); this.trialJobsMap = new Map(); // Root dir on HDFS this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); - this.experimentId = getExperimentId(); + this.experimentId = getExperimentId(); + this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); } public async run(): Promise { const restServer: PAIJobRestServer = component.get(PAIJobRestServer); await restServer.start(); this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); + while (!this.stopping) { + await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig); + await delay(3000); + } } - public listTrialJobs(): Promise { - const deferred : Deferred = new Deferred(); + public async listTrialJobs(): Promise { + const jobs: TrialJobDetail[] = []; + + this.trialJobsMap.forEach(async (value: PAITrialJobDetail, key: string) => { + if (value.form.jobType === 'TRIAL') { + jobs.push(await this.getTrialJob(key)); + } + }); - deferred.resolve([]); - return deferred.promise; + return Promise.resolve(jobs); } public getTrialJob(trialJobId: string): Promise { - const deferred : Deferred = new Deferred(); + if(!this.paiClusterConfig) { + throw new Error('PAI Cluster config is not initialized'); + } - deferred.resolve(undefined); - return deferred.promise; + const paiTrialJob: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + + if (!paiTrialJob) { + return Promise.reject(`trial job ${trialJobId} not found`) + } + + return Promise.resolve(paiTrialJob); } public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) { @@ -118,23 +138,37 @@ class PAITrainingService implements TrainingService { //TODO: use HDFS working folder instead const trialWorkingFolder: string = path.join(this.expRootDir, 
'trials', trialJobId); + const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); + //create tmp trial working folder locally. + await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); + await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`); + + // Write file content ( parameter.cfg ) to local tmp folders + const trialForm : TrialJobApplicationForm = (form) + if(trialForm) { + await fs.promises.writeFile(path.join(trialLocalTempFolder, 'parameter.cfg'), trialForm.hyperParameters, { encoding: 'utf8' }); + } + + // Step 1. Prepare PAI job configuration + const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; + const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); + const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', + paiJobName, Date.now(), trialWorkingFolder, form); this.trialJobsMap.set(trialJobId, trialJobDetail); - // Step 1. Prepare PAI job configuration - const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; - const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); - const nniPaiTrialCommand : string = String.Format( PAI_TRIAL_COMMAND_FORMAT, - `./${trialJobId}`, + // PAI will copy job's codeDir into /root directory + `/root/${trialJobId}`, trialJobId, - this.paiTrialConfig.command + this.paiTrialConfig.command, + getIPV4Address() ).replace(/\r\n|\n|\r/gm, ''); console.log(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); @@ -165,10 +199,11 @@ class PAITrainingService implements TrainingService { // PAI Task roles paiTaskRoles); console.log(`PAI job config is ${JSON.stringify(paiJobConfig)}`); + console.log(`Before submission, trial job detail is ${JSON.stringify(trialJobDetail)}`); // Step 2. 
Upload code files in codeDir onto HDFS try { - await HDFSClientUtility.copyDirectoryToHdfs(this.paiTrialConfig.codeDir, hdfsCodeDir, this.hdfsClient); + await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient); } catch (error) { this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`); throw new Error(error.message); @@ -187,11 +222,13 @@ class PAITrainingService implements TrainingService { } }; request(submitJobRequest, (error: Error, response: request.Response, body: any) => { + console.log(`After submission, trial job detail is ${JSON.stringify(trialJobDetail)}`); if (error || response.statusCode >= 400) { this.log.error(`PAI Training service: Submit trial ${trialJobId} to PAI Cluster failed!`); trialJobDetail.status = 'FAILED'; deferred.reject(error ? error.message : 'Submit trial failed, http code: ' + response.statusCode); } else { + trialJobDetail.submitTime = Date.now(); deferred.resolve(trialJobDetail); } }); @@ -282,14 +319,23 @@ class PAITrainingService implements TrainingService { return deferred.promise; } - public cleanUp(): Promise { + public async cleanUp(): Promise { + this.stopping = true; + const deferred : Deferred = new Deferred(); + const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + try { + await restServer.stop(); + deferred.resolve(); + this.log.info('PAI Training service rest server stopped successfully.'); + } catch (error) { + this.log.error(`PAI Training service rest server stopped failed, error: ${error.message}`); + deferred.reject(error); + } - deferred.resolve(); return deferred.promise; } - public get MetricsEmitter() : EventEmitter { return this.metricsEmitter; } diff --git a/src/sdk/pynni/nni/platform/__init__.py b/src/sdk/pynni/nni/platform/__init__.py index e0b44e49cb..fed452fc47 100644 --- a/src/sdk/pynni/nni/platform/__init__.py +++ b/src/sdk/pynni/nni/platform/__init__.py @@ -27,7 +27,7 @@ from 
.standalone import * elif env_args.platform == 'unittest': from .test import * -elif env_args.platform in ('local', 'remote'): +elif env_args.platform in ('local', 'remote', 'pai'): from .local import * else: raise RuntimeError('Unknown platform %s' % env_args.platform) From c1a3d34fcf76233bcc20e40af485cc04a9ba28e8 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Fri, 21 Sep 2018 19:56:50 +0800 Subject: [PATCH 08/60] Do not send metrics if it is empty --- .../training_service_tool/trial/metrics_reader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/metrics_reader.py b/src/nni_manager/training_service_tool/trial/metrics_reader.py index 6d18c3b92c..1d347b5548 100644 --- a/src/nni_manager/training_service_tool/trial/metrics_reader.py +++ b/src/nni_manager/training_service_tool/trial/metrics_reader.py @@ -113,9 +113,9 @@ def read_experiment_metrics(nnimanager_ip): result['jobId'] = NNI_TRIAL_JOB_ID result['metrics'] = reader.read_trial_metrics() print('Result metrics is {}'.format(json.dumps(result))) - response = rest_post(gen_update_metrics_url(BASE_URL.format(nnimanager_ip), DEFAULT_REST_PORT, NNI_TRIAL_JOB_ID), json.dumps(result), 10) - - print('Response code is {}'.format(response.status_code)) + if len(result['metrics']) > 0: + response = rest_post(gen_update_metrics_url(BASE_URL.format(nnimanager_ip), DEFAULT_REST_PORT, NNI_TRIAL_JOB_ID), json.dumps(result), 10) + print('Response code is {}'.format(response.status_code)) except Exception: #TODO error logging to file pass From 232d0e8fe608abfbb435ef9a83979ddbfe71ec33 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Fri, 21 Sep 2018 20:02:08 +0800 Subject: [PATCH 09/60] Update nnictl, to support pai configuration --- tools/nnicmd/config_schema.py | 42 +++++++------------------- tools/nnicmd/launcher.py | 55 ++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/tools/nnicmd/config_schema.py 
b/tools/nnicmd/config_schema.py index 8cd8431151..86164f6a88 100644 --- a/tools/nnicmd/config_schema.py +++ b/tools/nnicmd/config_schema.py @@ -50,7 +50,12 @@ 'trial':{ 'command': str, 'codeDir': os.path.exists, - 'gpuNum': And(int, lambda x: 0 <= x <= 99999) + 'gpuNum': And(int, lambda x: 0 <= x <= 99999), + Optional('cpuNum'): And(int, lambda x: 0 <= x <= 99999), + Optional('memoryMB'): int, + Optional('image'): str, + Optional('dataDir'): str, + Optional('outputDir'): str }, Optional('assessor'): Or({ 'builtinAssessorName': lambda x: x in ['Medianstop'], @@ -77,36 +82,9 @@ 'sshKeyPath': os.path.exists, Optional('passphrase'): str })], -Optional('pai'): -{ - 'jobName': str, - "image": str, - "authFile": os.path.exists, - "dataDir": os.path.exists, - "outputDir": os.path.exists, - "codeDir": os.path.exists, - "virtualCluster": str, - "taskRoles": [ - { - "name": str, - "taskNumber": And(int, lambda x: 0 <= x <= 99999), - "cpuNumber": And(int, lambda x: 0 <= x <= 99999), - "memoryMB": And(int, lambda x: 0 <= x <= 99999), - "shmMB": And(int, lambda x: 0 <= x <= 99999), - "gpuNumber": And(int, lambda x: 0 <= x <= 99999), - "portList": [ - { - "label": str, - "beginAt": str, - "portNumber": And(int, lambda x: 0 < x < 65535) - } - ], - "command": str, - "minFailedTaskCount": And(int, lambda x: 0 <= x <= 99999), - "minSucceededTaskCount": And(int, lambda x: 0 <= x <= 99999) - } - ], - "gpuType": str, - "retryCount": And(int, lambda x: 0 <= x <= 99999) +Optional('paiConfig'):{ + 'userName': str, + 'passWord': str, + 'host': str } }) \ No newline at end of file diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index b99a428b1d..25539693ad 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -64,6 +64,16 @@ def set_trial_config(experiment_config, port): value_dict['command'] = experiment_config['trial']['command'] value_dict['codeDir'] = experiment_config['trial']['codeDir'] value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] + if 
experiment_config['trial'].get('cpuNum'): + value_dict['cpuNum'] = experiment_config['trial']['cpuNum'] + if experiment_config['trial'].get('memoryMB'): + value_dict['memoryMB'] = experiment_config['trial']['memoryMB'] + if experiment_config['trial'].get('image'): + value_dict['image'] = experiment_config['trial']['image'] + if experiment_config['trial'].get('dataDir'): + value_dict['dataDir'] = experiment_config['trial']['dataDir'] + if experiment_config['trial'].get('outputDir'): + value_dict['outputDir'] = experiment_config['trial']['outputDir'] request_data['trial_config'] = value_dict response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20) return True if response.status_code == 200 else False @@ -87,6 +97,20 @@ def set_remote_config(experiment_config, port): #set trial_config return set_trial_config(experiment_config, port), err_message +def set_pai_config(experiment_config, port): + '''set pai configuration''' + pai_config_data = dict() + pai_config_data['pai_config'] = experiment_config['paiConfig'] + response = rest_put(cluster_metadata_url(port), json.dumps(pai_config_data), 20) + err_message = '' + if not response or not response.status_code == 200: + if response is not None: + err_message = response.text + return False, err_message + + #set trial_config + return set_trial_config(experiment_config, port), err_message + def set_experiment(experiment_config, mode, port): '''Call startExperiment (rest POST /experiment) with yaml file content''' request_data = dict() @@ -106,7 +130,7 @@ def set_experiment(experiment_config, mode, port): {'key':'codeDir', 'value':experiment_config['trial']['codeDir']}) request_data['clusterMetaData'].append( {'key': 'command', 'value': experiment_config['trial']['command']}) - else: + elif experiment_config['trainingServicePlatform'] == 'remote': request_data['clusterMetaData'].append( {'key': 'machine_list', 'value': experiment_config['machineList']}) value_dict = dict() @@ -115,6 +139,20 @@ def 
set_experiment(experiment_config, mode, port): value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] request_data['clusterMetaData'].append( {'key': 'trial_config', 'value': value_dict}) + elif experiment_config['trainingServicePlatform'] == 'pai': + request_data['clusterMetaData'].append( + {'key': 'pai_config', 'value': experiment_config['paiConfig']}) + value_dict = dict() + value_dict['command'] = experiment_config['trial']['command'] + value_dict['codeDir'] = experiment_config['trial']['codeDir'] + value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] + value_dict['cpuNum'] = experiment_config['trial']['cpuNum'] + value_dict['memoryMB'] = experiment_config['trial']['memoryMB'] + value_dict['image'] = experiment_config['trial']['image'] + value_dict['dataDir'] = experiment_config['trial']['dataDir'] + value_dict['outputDir'] = experiment_config['trial']['outputDir'] + request_data['clusterMetaData'].append( + {'key': 'trial_config', 'value': value_dict}) response = rest_post(experiment_url(port), json.dumps(request_data), 20) return response if response.status_code == 200 else None @@ -183,6 +221,21 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No except Exception: raise Exception(ERROR_INFO % 'Rest server stopped!') exit(0) + + #set pai config + if experiment_config['trainingServicePlatform'] == 'pai': + print_normal('Setting pai config...') + config_result, err_msg = set_pai_config(experiment_config, REST_PORT) + if config_result: + print_normal('Success!') + else: + print_error('Failed! 
Error is: {}'.format(err_msg)) + try: + cmds = ['pkill', '-P', str(rest_process.pid)] + call(cmds) + except Exception: + raise Exception(ERROR_INFO % 'Rest server stopped!') + exit(0) # start a new experiment print_normal('Starting experiment...') From a5d4a2088f56e0cb5ac5e8875ca2766cf3da8b57 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 12:56:27 +0800 Subject: [PATCH 10/60] fix repo --- src/nni_manager/training_service/pai/paiData.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 2c8bb6b868..620aca3d3b 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -46,7 +46,7 @@ export class PAITrialJobDetail implements TrialJobDetail { } export const PAI_TRIAL_COMMAND_FORMAT: string = -`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai +`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai-t-shya2 && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} && cd $NNI_SYS_DIR && mkdir .nni && python3 -m trial.trial_keeper --trial_command '{2}' --nnimanager_ip '{3}'`; \ No newline at end of file From cd64e5fe51c97376e88006432a81b1126fe40eb1 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 13:48:29 +0800 Subject: [PATCH 11/60] add hdfs_output_dir --- src/nni_manager/training_service/pai/paiData.ts | 4 ++-- src/nni_manager/training_service/pai/paiTrainingService.ts | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 620aca3d3b..e7a6a062a1 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -47,6 +47,6 @@ export class PAITrialJobDetail implements TrialJobDetail { export const 
PAI_TRIAL_COMMAND_FORMAT: string = `pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai-t-shya2 -&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} +&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} NNI_HDFS_OUTPUT_DIR={2} && cd $NNI_SYS_DIR && mkdir .nni -&& python3 -m trial.trial_keeper --trial_command '{2}' --nnimanager_ip '{3}'`; \ No newline at end of file +&& python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}'`; \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 1764f3828a..eb055cee90 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -167,6 +167,7 @@ class PAITrainingService implements TrainingService { // PAI will copy job's codeDir into /root directory `/root/${trialJobId}`, trialJobId, + this.paiTrialConfig.outputDir, this.paiTrialConfig.command, getIPV4Address() ).replace(/\r\n|\n|\r/gm, ''); From 889e066f9a8f3332c465a4769faebabc7e45b05b Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 14:13:34 +0800 Subject: [PATCH 12/60] add copy logic --- .../trial/hdfsClientUtility.py | 58 +++++++++++++++++++ .../trial/trial_keeper.py | 21 +++++++ 2 files changed, 79 insertions(+) create mode 100644 src/nni_manager/training_service_tool/trial/hdfsClientUtility.py diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py new file mode 100644 index 0000000000..167f28a466 --- /dev/null +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import os + +def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): + '''Copy directory from local to hdfs''' + if not os.path.exists(localDirectory): + raise Exception('Local Directory does not exist!') + if not pathExists(hdfsDirectory, hdfsClient): + hdfsClient.makedirs(hdfsDirectory) + try: + for file in os.listdir(localDirectory): + file_path = os.path.join(localDirectory, file) + if os.path.isdir(file_path): + copyDirectoryToHdfs(file_path, hdfsDirectory, hdfsClient) + else: + copyFileToHdfs(localDirectory, hdfsDirectory, hdfsClient) + return True + except: + return False + +def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient): + '''Copy a local file to hdfs directory''' + if not os.path.exists(localFilePath): + raise Exception('Local file Path does not exist!') + if not pathExists(hdfsFilePath, hdfsClient): + hdfsClient.makedirs(hdfsFilePath) + try: + hdfsClient.upload(hdfsFilePath, localFilePath, overwrite = True) + except: + return False + return True + +def pathExists(hdfsPath, hdfsClient): + '''Check if an HDFS path already exists''' + result = hdfsClient.status(hdfsPath, strict=False) + if result is not None: + return True + else: + return False diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 07084691bc..117af6bcd3 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -24,7 +24,10 @@ import time import logging import shlex +import re +import hdfs +from .hdfsClientUtility import copyDirectoryToHdfs from .constants import HOME_DIR, LOG_DIR, STDOUT_FULL_PATH, STDERR_FULL_PATH from .metrics_reader import read_experiment_metrics @@ -47,6 +50,24 @@ def main_loop(args): retCode = process.poll() if retCode is not None: print('subprocess terminated. Exit code is {}. 
Quit'.format(retCode)) + if 'NNI_OUTPUT_DIR' in os.environ and 'NNI_HDFS_OUTPUT_DIR' in os.environ: + local_directory = os.environ['NNI_OUTPUT_DIR'] + hdfs_output_dir = os.environ['NNI_HDFS_OUTPUT_DIR'] + #get hdfs_host and hdfs_directory + hdfs_host_pattern = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' + hdfs_host = re.findall(hdfs_host_pattern, hdfs_output_dir) + hdfs_directory = hdfs_output_dir.replace(hdfs_host[0], '') + #get url_host + url_host_pattern = '[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' + url_host = re.findall(url_host_pattern, hdfs_host[0]) + #init hdfs client + print(url_host, local_directory, hdfs_directory) + hdfs_client = hdfs.Client('http://{0}:{1}'.format(url_host, 50070)) + + if copyDirectoryToHdfs(local_directory, hdfs_directory, hdfs_client): + print('copy directory success!') + else: + print('copy directory failed!') break else: print('subprocess pid: {} is still alive'.format(process.pid)) From de9c37400fe1f908e7f5f75ebae34e1a468e426f Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 14:46:05 +0800 Subject: [PATCH 13/60] debug --- src/nni_manager/training_service_tool/trial/trial_keeper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 117af6bcd3..376418ce9f 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -63,11 +63,12 @@ def main_loop(args): #init hdfs client print(url_host, local_directory, hdfs_directory) hdfs_client = hdfs.Client('http://{0}:{1}'.format(url_host, 50070)) - + ''' if copyDirectoryToHdfs(local_directory, hdfs_directory, hdfs_client): print('copy directory success!') else: print('copy directory failed!') + ''' break else: print('subprocess pid: {} is still alive'.format(process.pid)) From e98b0accfc1a2325e57a2efc124d94be09c2f8cd 
Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 20:12:25 +0800 Subject: [PATCH 14/60] update hdfsUtility --- .../training_service/pai/paiData.ts | 4 +- .../pai/paiTrainingService.ts | 1 + .../training_service_tool/setup.py | 3 +- .../trial/hdfsClientUtility.py | 52 ++++++++++++------- .../trial/trial_keeper.py | 20 ++++--- tools/nnicmd/launcher.py | 7 +-- tools/nnicmd/webui_utils.py | 4 ++ 7 files changed, 59 insertions(+), 32 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index e7a6a062a1..d1c36a09ec 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -47,6 +47,6 @@ export class PAITrialJobDetail implements TrialJobDetail { export const PAI_TRIAL_COMMAND_FORMAT: string = `pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai-t-shya2 -&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} NNI_HDFS_OUTPUT_DIR={2} +&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} NNI_HDFS_OUTPUT_DIR={2} NNI_USER_NAME={3} && cd $NNI_SYS_DIR && mkdir .nni -&& python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}'`; \ No newline at end of file +&& python3 -m trial.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}'`; \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index eb055cee90..fbf332fb22 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -168,6 +168,7 @@ class PAITrainingService implements TrainingService { `/root/${trialJobId}`, trialJobId, this.paiTrialConfig.outputDir, + this.paiClusterConfig.userName, this.paiTrialConfig.command, getIPV4Address() ).replace(/\r\n|\n|\r/gm, ''); diff --git 
a/src/nni_manager/training_service_tool/setup.py b/src/nni_manager/training_service_tool/setup.py index a65a79263e..bd22724571 100644 --- a/src/nni_manager/training_service_tool/setup.py +++ b/src/nni_manager/training_service_tool/setup.py @@ -9,7 +9,8 @@ python_requires = '>=3.5', install_requires = [ 'requests', - 'psutil' + 'psutil', + 'pyhdfs' ], author = 'Microsoft NNI Team', diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index 167f28a466..6168896095 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -18,41 +18,57 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + import os +from pyhdfs import HdfsClient def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): '''Copy directory from local to hdfs''' if not os.path.exists(localDirectory): raise Exception('Local Directory does not exist!') - if not pathExists(hdfsDirectory, hdfsClient): - hdfsClient.makedirs(hdfsDirectory) try: + hdfsClient.mkdirs(hdfsDirectory) for file in os.listdir(localDirectory): file_path = os.path.join(localDirectory, file) if os.path.isdir(file_path): - copyDirectoryToHdfs(file_path, hdfsDirectory, hdfsClient) + hdfs_directory = os.path.join(hdfsDirectory, file) + copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) else: - copyFileToHdfs(localDirectory, hdfsDirectory, hdfsClient) + hdfs_file_path = os.path.join(hdfsDirectory, file) + copyFileToHdfs(file_path, hdfs_file_path, hdfsClient) return True except: return False -def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient): +def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): '''Copy a local file to hdfs directory''' if not os.path.exists(localFilePath): raise Exception('Local file Path does not exist!') - if not pathExists(hdfsFilePath, hdfsClient): - hdfsClient.makedirs(hdfsFilePath) + if hdfsClient.exists(hdfsFilePath): + if override: + hdfsClient.delete(hdfsFilePath) + else: + return False try: - hdfsClient.upload(hdfsFilePath, localFilePath, overwrite = True) + return client.copy_from_local(localFilePath, hdfsFilePath) except: - return False - return True - -def pathExists(hdfsPath, hdfsClient): - '''Check if an HDFS path already exists''' - result = hdfsClient.status(hdfsPath, strict=False) - if result is not None: - return True - else: - return False + return False \ No newline at end 
of file diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 376418ce9f..8c8d67a143 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -25,7 +25,7 @@ import logging import shlex import re -import hdfs +from pyhdfs import HdfsClient from .hdfsClientUtility import copyDirectoryToHdfs from .constants import HOME_DIR, LOG_DIR, STDOUT_FULL_PATH, STDERR_FULL_PATH @@ -50,9 +50,10 @@ def main_loop(args): retCode = process.poll() if retCode is not None: print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) - if 'NNI_OUTPUT_DIR' in os.environ and 'NNI_HDFS_OUTPUT_DIR' in os.environ: - local_directory = os.environ['NNI_OUTPUT_DIR'] + if 'NNI_OUTPUT_DIR' in os.environ and 'NNI_HDFS_OUTPUT_DIR' in os.environ and 'NNI_USER_NAME' in os.environ: + local_directory = os.environ['NNI_OUTPUT_DIR'] hdfs_output_dir = os.environ['NNI_HDFS_OUTPUT_DIR'] + nni_user_name = os.environ['NNI_USER_NAME'] #get hdfs_host and hdfs_directory hdfs_host_pattern = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' hdfs_host = re.findall(hdfs_host_pattern, hdfs_output_dir) @@ -61,14 +62,17 @@ def main_loop(args): url_host_pattern = '[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' url_host = re.findall(url_host_pattern, hdfs_host[0]) #init hdfs client - print(url_host, local_directory, hdfs_directory) - hdfs_client = hdfs.Client('http://{0}:{1}'.format(url_host, 50070)) - ''' - if copyDirectoryToHdfs(local_directory, hdfs_directory, hdfs_client): + if not os.path.isdir(local_directory): + raise Exception('Local Directory Error!') + #get local folder name + local_folder_name = local_directory.replace(os.path.dirname(local_directory), '')[1:] + hdfs_output_dir_full = os.path.join(hdfs_directory, local_folder_name) + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(url_host[0], '50070'), 
user_name=nni_user_name) + print(local_directory, hdfs_output_dir_full) + if copyDirectoryToHdfs(local_directory, hdfs_output_dir_full, hdfs_client): print('copy directory success!') else: print('copy directory failed!') - ''' break else: print('subprocess pid: {} is still alive'.format(process.pid)) diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 25539693ad..5787627a6d 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -261,9 +261,10 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No else: print_normal('Starting web ui...') webui_process = start_web_ui(webuiport) - nni_config.set_config('webuiPid', webui_process.pid) - print_normal('Starting web ui success!') - print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl')))) + if webui_process: + nni_config.set_config('webuiPid', webui_process.pid) + print_normal('Starting web ui success!') + print_normal('{0} {1}'.format('Web UI url:', ' '.join(nni_config.get_config('webuiUrl')))) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, REST_PORT)) diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py index 1121452a08..c7e4264bce 100644 --- a/tools/nnicmd/webui_utils.py +++ b/tools/nnicmd/webui_utils.py @@ -35,6 +35,10 @@ def start_web_ui(port): cmds = [serve, '-s', '-n', web_ui, '-l', str(port)] stdout_file = open(STDOUT_FULL_PATH, 'a+') stderr_file = open(STDERR_FULL_PATH, 'a+') + print(cmds) + if not serve or not web_ui: + print_error('Failed to start webui!') + return None webui_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) if webui_process.returncode is None: webui_url_list = [] From 272411a2e097ac9896fa220e340e2d5befa3b925 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 20:27:51 +0800 Subject: [PATCH 15/60] debug --- .../training_service_tool/trial/trial_keeper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 8c8d67a143..9fd362a859 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -69,10 +69,10 @@ def main_loop(args): hdfs_output_dir_full = os.path.join(hdfs_directory, local_folder_name) hdfs_client = HdfsClient(hosts='{0}:{1}'.format(url_host[0], '50070'), user_name=nni_user_name) print(local_directory, hdfs_output_dir_full) - if copyDirectoryToHdfs(local_directory, hdfs_output_dir_full, hdfs_client): - print('copy directory success!') - else: - print('copy directory failed!') + # if copyDirectoryToHdfs(local_directory, hdfs_output_dir_full, hdfs_client): + # print('copy directory success!') + # else: + # print('copy directory failed!') break else: print('subprocess pid: {} is still alive'.format(process.pid)) From 4cba4d135e97814e0b090c5f2be0279a7a880ce6 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 20:39:05 +0800 Subject: [PATCH 16/60] debug --- .../trial/trial_keeper.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 9fd362a859..adc5b28c8e 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -54,21 +54,21 @@ def main_loop(args): local_directory = os.environ['NNI_OUTPUT_DIR'] hdfs_output_dir = os.environ['NNI_HDFS_OUTPUT_DIR'] nni_user_name = os.environ['NNI_USER_NAME'] - #get hdfs_host and hdfs_directory - hdfs_host_pattern = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' - hdfs_host = re.findall(hdfs_host_pattern, hdfs_output_dir) - hdfs_directory = hdfs_output_dir.replace(hdfs_host[0], '') - #get url_host - url_host_pattern = 
'[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' - url_host = re.findall(url_host_pattern, hdfs_host[0]) - #init hdfs client - if not os.path.isdir(local_directory): - raise Exception('Local Directory Error!') - #get local folder name - local_folder_name = local_directory.replace(os.path.dirname(local_directory), '')[1:] - hdfs_output_dir_full = os.path.join(hdfs_directory, local_folder_name) - hdfs_client = HdfsClient(hosts='{0}:{1}'.format(url_host[0], '50070'), user_name=nni_user_name) - print(local_directory, hdfs_output_dir_full) + # #get hdfs_host and hdfs_directory + # hdfs_host_pattern = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' + # hdfs_host = re.findall(hdfs_host_pattern, hdfs_output_dir) + # hdfs_directory = hdfs_output_dir.replace(hdfs_host[0], '') + # #get url_host + # url_host_pattern = '[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' + # url_host = re.findall(url_host_pattern, hdfs_host[0]) + # #init hdfs client + # if not os.path.isdir(local_directory): + # raise Exception('Local Directory Error!') + # #get local folder name + # local_folder_name = local_directory.replace(os.path.dirname(local_directory), '')[1:] + # hdfs_output_dir_full = os.path.join(hdfs_directory, local_folder_name) + # hdfs_client = HdfsClient(hosts='{0}:{1}'.format(url_host[0], '50070'), user_name=nni_user_name) + # print(local_directory, hdfs_output_dir_full) # if copyDirectoryToHdfs(local_directory, hdfs_output_dir_full, hdfs_client): # print('copy directory success!') # else: From 45d103173e3285bb0516ea5550eda45838d34904 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 20:49:47 +0800 Subject: [PATCH 17/60] fix setup.py bug --- setup.py | 3 +- .../trial/trial_keeper.py | 38 +++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/setup.py b/setup.py index 6e0782ccdf..fb57f247ef 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,8 @@ def run(self): 'pyyaml', 'requests', 'scipy', - 'schema' + 'schema', + 
'pyhdfs' ], dependency_links = [ 'git+https://github.com/hyperopt/hyperopt.git' diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index adc5b28c8e..8c8d67a143 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -54,25 +54,25 @@ def main_loop(args): local_directory = os.environ['NNI_OUTPUT_DIR'] hdfs_output_dir = os.environ['NNI_HDFS_OUTPUT_DIR'] nni_user_name = os.environ['NNI_USER_NAME'] - # #get hdfs_host and hdfs_directory - # hdfs_host_pattern = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' - # hdfs_host = re.findall(hdfs_host_pattern, hdfs_output_dir) - # hdfs_directory = hdfs_output_dir.replace(hdfs_host[0], '') - # #get url_host - # url_host_pattern = '[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' - # url_host = re.findall(url_host_pattern, hdfs_host[0]) - # #init hdfs client - # if not os.path.isdir(local_directory): - # raise Exception('Local Directory Error!') - # #get local folder name - # local_folder_name = local_directory.replace(os.path.dirname(local_directory), '')[1:] - # hdfs_output_dir_full = os.path.join(hdfs_directory, local_folder_name) - # hdfs_client = HdfsClient(hosts='{0}:{1}'.format(url_host[0], '50070'), user_name=nni_user_name) - # print(local_directory, hdfs_output_dir_full) - # if copyDirectoryToHdfs(local_directory, hdfs_output_dir_full, hdfs_client): - # print('copy directory success!') - # else: - # print('copy directory failed!') + #get hdfs_host and hdfs_directory + hdfs_host_pattern = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' + hdfs_host = re.findall(hdfs_host_pattern, hdfs_output_dir) + hdfs_directory = hdfs_output_dir.replace(hdfs_host[0], '') + #get url_host + url_host_pattern = '[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' + url_host = re.findall(url_host_pattern, hdfs_host[0]) + #init hdfs client + if not 
os.path.isdir(local_directory): + raise Exception('Local Directory Error!') + #get local folder name + local_folder_name = local_directory.replace(os.path.dirname(local_directory), '')[1:] + hdfs_output_dir_full = os.path.join(hdfs_directory, local_folder_name) + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(url_host[0], '50070'), user_name=nni_user_name) + print(local_directory, hdfs_output_dir_full) + if copyDirectoryToHdfs(local_directory, hdfs_output_dir_full, hdfs_client): + print('copy directory success!') + else: + print('copy directory failed!') break else: print('subprocess pid: {} is still alive'.format(process.pid)) From e63ffc00ad55b12bf6edd7077ab304ff5249953d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 21:11:37 +0800 Subject: [PATCH 18/60] fix bug --- .../training_service_tool/trial/hdfsClientUtility.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index 6168896095..80d50310b7 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -54,7 +54,8 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) else: hdfs_file_path = os.path.join(hdfsDirectory, file) - copyFileToHdfs(file_path, hdfs_file_path, hdfsClient) + if not copyFileToHdfs(file_path, hdfs_file_path, hdfsClient): + return False return True except: return False @@ -69,6 +70,6 @@ def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): else: return False try: - return client.copy_from_local(localFilePath, hdfsFilePath) + return hdfsClient.copy_from_local(localFilePath, hdfsFilePath) except: return False \ No newline at end of file From 954d640a05a70174d2a366f08a17be9151f6e81b Mon Sep 17 00:00:00 2001 From: "Shinai Yang 
(FA TALENT)" Date: Mon, 24 Sep 2018 21:35:02 +0800 Subject: [PATCH 19/60] debug --- tools/nnicmd/webui_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py index c7e4264bce..19aed74e36 100644 --- a/tools/nnicmd/webui_utils.py +++ b/tools/nnicmd/webui_utils.py @@ -33,6 +33,7 @@ def start_web_ui(port): serve = os.environ.get('NNI_SERVE', 'serve') web_ui = os.environ.get('WEB_UI_FOLDER') cmds = [serve, '-s', '-n', web_ui, '-l', str(port)] + print(cmds) stdout_file = open(STDOUT_FULL_PATH, 'a+') stderr_file = open(STDERR_FULL_PATH, 'a+') print(cmds) From 0410d05845ae00a15972a80b75c959a44f9646d4 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Mon, 24 Sep 2018 21:39:57 +0800 Subject: [PATCH 20/60] debug --- tools/nnicmd/webui_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py index 19aed74e36..061a0fdbaf 100644 --- a/tools/nnicmd/webui_utils.py +++ b/tools/nnicmd/webui_utils.py @@ -31,7 +31,9 @@ def start_web_ui(port): '''start web ui''' serve = os.environ.get('NNI_SERVE', 'serve') + print(serve) web_ui = os.environ.get('WEB_UI_FOLDER') + print(web_ui) cmds = [serve, '-s', '-n', web_ui, '-l', str(port)] print(cmds) stdout_file = open(STDOUT_FULL_PATH, 'a+') From e3788d2c2fa75fa773af7a54640fcc523b8162a0 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 09:46:30 +0800 Subject: [PATCH 21/60] add exception handler --- .../training_service_tool/trial/hdfsClientUtility.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index 80d50310b7..ad281a3aab 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -51,13 +51,15 @@ def 
copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): file_path = os.path.join(localDirectory, file) if os.path.isdir(file_path): hdfs_directory = os.path.join(hdfsDirectory, file) - copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) + if not copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient): + return False else: hdfs_file_path = os.path.join(hdfsDirectory, file) if not copyFileToHdfs(file_path, hdfs_file_path, hdfsClient): return False return True - except: + except Exception as exception: + print(exception) return False def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): @@ -71,5 +73,6 @@ def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): return False try: return hdfsClient.copy_from_local(localFilePath, hdfsFilePath) - except: + except Exception as exception: + print(exception) return False \ No newline at end of file From 793cbf107570dbc61383e2d75deb490079672e9e Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 10:15:24 +0800 Subject: [PATCH 22/60] fix bug --- .../training_service_tool/trial/hdfsClientUtility.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index ad281a3aab..731c45a1ea 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -47,6 +47,9 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): raise Exception('Local Directory does not exist!') try: hdfsClient.mkdirs(hdfsDirectory) + print('---------list all files in ', localDirectory) + print(os.listdir(localDirectory)) + print('-----------end----------') for file in os.listdir(localDirectory): file_path = os.path.join(localDirectory, file) if os.path.isdir(file_path): From b14c10815ed04ac25229738476b92b2d7670b2a1 Mon Sep 17 00:00:00 2001 From: 
"Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 10:27:41 +0800 Subject: [PATCH 23/60] debug --- src/nni_manager/training_service_tool/trial/trial_keeper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 8c8d67a143..733942fd5b 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -47,6 +47,7 @@ def main_loop(args): print('Subprocess pid is {}'.format(process.pid)) print('Current cwd is {}'.format(os.getcwd())) while True: + print(os.listdir(os.environ['NNI_OUTPUT_DIR'])) retCode = process.poll() if retCode is not None: print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) From 0ae9f6db7465a0c215ec187ca701c29455095e36 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 10:36:42 +0800 Subject: [PATCH 24/60] fix bug --- .../trial/hdfsClientUtility.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index 731c45a1ea..19da3ea731 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -47,18 +47,23 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): raise Exception('Local Directory does not exist!') try: hdfsClient.mkdirs(hdfsDirectory) - print('---------list all files in ', localDirectory) - print(os.listdir(localDirectory)) - print('-----------end----------') for file in os.listdir(localDirectory): file_path = os.path.join(localDirectory, file) if os.path.isdir(file_path): hdfs_directory = os.path.join(hdfsDirectory, file) - if not copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient): + try: + if not 
copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient): + return False + except Exception as exception: + print(exception) return False else: hdfs_file_path = os.path.join(hdfsDirectory, file) - if not copyFileToHdfs(file_path, hdfs_file_path, hdfsClient): + try: + if not copyFileToHdfs(file_path, hdfs_file_path, hdfsClient): + return False + except Exception as exception: + print(exception) return False return True except Exception as exception: From 5938310c467e963fe650345ff17c724967994a52 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 10:51:54 +0800 Subject: [PATCH 25/60] fix bug --- .../trial/hdfsClientUtility.py | 38 ++++++++----------- .../trial/trial_keeper.py | 1 - 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index 19da3ea731..db567e881b 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -45,30 +45,22 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): '''Copy directory from local to hdfs''' if not os.path.exists(localDirectory): raise Exception('Local Directory does not exist!') - try: hdfsClient.mkdirs(hdfsDirectory) - for file in os.listdir(localDirectory): - file_path = os.path.join(localDirectory, file) - if os.path.isdir(file_path): - hdfs_directory = os.path.join(hdfsDirectory, file) - try: - if not copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient): - return False - except Exception as exception: - print(exception) - return False - else: - hdfs_file_path = os.path.join(hdfsDirectory, file) - try: - if not copyFileToHdfs(file_path, hdfs_file_path, hdfsClient): - return False - except Exception as exception: - print(exception) - return False - return True - except Exception as exception: - print(exception) - return False + for file in 
os.listdir(localDirectory): + file_path = os.path.join(localDirectory, file) + if os.path.isdir(file_path): + hdfs_directory = os.path.join(hdfsDirectory, file) + try: + copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) + except Exception as exception: + print(exception) + else: + hdfs_file_path = os.path.join(hdfsDirectory, file) + try: + copyFileToHdfs(file_path, hdfs_file_path, hdfsClient) + except Exception as exception: + print(exception) + return True def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): '''Copy a local file to hdfs directory''' diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 733942fd5b..8c8d67a143 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -47,7 +47,6 @@ def main_loop(args): print('Subprocess pid is {}'.format(process.pid)) print('Current cwd is {}'.format(os.getcwd())) while True: - print(os.listdir(os.environ['NNI_OUTPUT_DIR'])) retCode = process.poll() if retCode is not None: print('subprocess terminated. Exit code is {}. 
Quit'.format(retCode)) From c756188c41fb266e03ad326029c02e548eb72a6d Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 10:53:07 +0800 Subject: [PATCH 26/60] fix bug --- src/nni_manager/training_service_tool/trial/hdfsClientUtility.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index db567e881b..7ff38fd1de 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -48,6 +48,7 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): hdfsClient.mkdirs(hdfsDirectory) for file in os.listdir(localDirectory): file_path = os.path.join(localDirectory, file) + print('------copying ', file_path) if os.path.isdir(file_path): hdfs_directory = os.path.join(hdfsDirectory, file) try: From b6ce81344cd31bb1439bcc1e002f83627a2f20b0 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Tue, 25 Sep 2018 13:57:15 +0800 Subject: [PATCH 27/60] split metrics into single line, and read metrics no matter if subprocess is already quitted --- .../training_service/pai/paiJobRestServer.ts | 14 +++++++++----- .../training_service_tool/trial/trial_keeper.py | 5 ++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts index 3079cd47ec..0d521e53c0 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -71,11 +71,15 @@ export class PAIJobRestServer extends RestServer{ try { this.log.info(`Get update-metrics request, trial job id is ${req.params.id}`); this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`); - - this.paiTrainingService.MetricsEmitter.emit('metric', { - id : req.body.jobId, - data : req.body.metrics - }); 
+ + // Split metrics array into single metric, then emit + // Warning: If not split metrics into single ones, the behavior will be UNKNOWN + for (const singleMetric of req.body.metrics) { + this.paiTrainingService.MetricsEmitter.emit('metric', { + id : req.body.jobId, + data : singleMetric + }); + } res.send(); } diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 07084691bc..6331ac4ff1 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -45,12 +45,15 @@ def main_loop(args): print('Current cwd is {}'.format(os.getcwd())) while True: retCode = process.poll() + ## Read experiment metrics, to avoid missing metrics + read_experiment_metrics(args.nnimanager_ip) + if retCode is not None: print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) break else: print('subprocess pid: {} is still alive'.format(process.pid)) - read_experiment_metrics(args.nnimanager_ip) + time.sleep(2) def trial_keeper_help_info(*args): From dc0f96b62c4e94cc01f91839263a46bdc551e9f4 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 14:15:40 +0800 Subject: [PATCH 28/60] add unit test for hdfsClientUtility --- .../trial/hdfsClientUtility.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index 7ff38fd1de..d95681631d 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -18,26 +18,6 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# Copyright (c) Microsoft Corporation -# All rights reserved. 
-# -# MIT License -# -# Permission is hereby granted, free of charge, -# to any person obtaining a copy of this software and associated -# documentation files (the "Software"), to deal in the Software without restriction, -# including without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and -# to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING -# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- import os from pyhdfs import HdfsClient @@ -48,7 +28,6 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): hdfsClient.mkdirs(hdfsDirectory) for file in os.listdir(localDirectory): file_path = os.path.join(localDirectory, file) - print('------copying ', file_path) if os.path.isdir(file_path): hdfs_directory = os.path.join(hdfsDirectory, file) try: From 55b6e08f57b497ccf4b4899fcd44e83015b21d84 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 14:27:38 +0800 Subject: [PATCH 29/60] fix bug --- .../test/test_hdfsClientUtility.py | 101 ++++++++++++++++++ .../trial/hdfsClientUtility.py | 12 ++- 2 files changed, 109 insertions(+), 4 deletions(-) create mode 100644 src/nni_manager/training_service_tool/test/test_hdfsClientUtility.py diff --git a/src/nni_manager/training_service_tool/test/test_hdfsClientUtility.py b/src/nni_manager/training_service_tool/test/test_hdfsClientUtility.py new file mode 100644 index 0000000000..2d7cc6ed4d --- /dev/null +++ b/src/nni_manager/training_service_tool/test/test_hdfsClientUtility.py @@ -0,0 +1,101 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import unittest +import json +import sys +from pyhdfs import HdfsClient +sys.path.append("..") +from trial.hdfsClientUtility import copyFileToHdfs, copyDirectoryToHdfs +import os +import shutil +import random +import string + +class HDFSClientUtilityTest(unittest.TestCase): + '''Unit test for hdfsClientUtility.py''' + def setUp(self): + self.hdfs_file_path = '../../.vscode/hdfsInfo.json' + self.hdfs_config = None + try: + with open(self.hdfs_file_path, 'r') as file: + self.hdfs_config = json.load(file) + except Exception as exception: + print(exception) + + self.hdfs_client = HdfsClient(hosts='{0}:{1}'.format(self.hdfs_config['host'], '50070'), user_name=self.hdfs_config['userName']) + + def get_random_name(self, length): + return ''.join(random.sample(string.ascii_letters + string.digits, length)) + + def test_copy_file_run(self): + '''test copyFileToHdfs''' + file_name = self.get_random_name(8) + file_content = 'hello world!' 
+ + with open('./{}'.format(file_name), 'w') as file: + file.write(file_content) + + result = copyFileToHdfs('./{}'.format(file_name), '/{0}/{1}'.format(self.hdfs_config['userName'], file_name), self.hdfs_client) + self.assertTrue(result) + + file_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName'])) + self.assertIn(file_name, file_list) + + hdfs_file_name = self.get_random_name(8) + self.hdfs_client.copy_to_local('/{0}/{1}'.format(self.hdfs_config['userName'], file_name), './{}'.format(hdfs_file_name)) + self.assertTrue(os.path.exists('./{}'.format(hdfs_file_name))) + + with open('./{}'.format(hdfs_file_name), 'r') as file: + content = file.readline() + self.assertEqual(file_content, content) + #clean up + os.remove('./{}'.format(file_name)) + os.remove('./{}'.format(hdfs_file_name)) + self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], file_name)) + + def test_copy_directory_run(self): + '''test copyDirectoryToHdfs''' + directory_name = self.get_random_name(8) + file_name_list = [self.get_random_name(8), self.get_random_name(8)] + file_content = 'hello world!' 
+ + os.makedirs('./{}'.format(directory_name)) + for file_name in file_name_list: + with open('./{0}/{1}'.format(directory_name, file_name), 'w') as file: + file.write(file_content) + + result = copyDirectoryToHdfs('./{}'.format(directory_name), '/{0}/{1}'.format(self.hdfs_config['userName'], directory_name), self.hdfs_client) + self.assertTrue(result) + + directory_list = self.hdfs_client.listdir('/{0}'.format(self.hdfs_config['userName'])) + self.assertIn(directory_name, directory_list) + + sub_file_list = self.hdfs_client.listdir('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name)) + for file_name in file_name_list: + self.assertIn(file_name, sub_file_list) + #clean up + self.hdfs_client.delete('/{0}/{1}/{2}'.format(self.hdfs_config['userName'], directory_name, file_name)) + self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'], directory_name)) + + shutil.rmtree('./{}'.format(directory_name)) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index d95681631d..1bbfa258ca 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -26,21 +26,24 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): if not os.path.exists(localDirectory): raise Exception('Local Directory does not exist!') hdfsClient.mkdirs(hdfsDirectory) + result = True for file in os.listdir(localDirectory): file_path = os.path.join(localDirectory, file) if os.path.isdir(file_path): hdfs_directory = os.path.join(hdfsDirectory, file) try: - copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) + result = result and copyDirectoryToHdfs(file_path, hdfs_directory, hdfsClient) except Exception as exception: print(exception) + result = False else: hdfs_file_path = os.path.join(hdfsDirectory, 
file) try: - copyFileToHdfs(file_path, hdfs_file_path, hdfsClient) + result = result and copyFileToHdfs(file_path, hdfs_file_path, hdfsClient) except Exception as exception: print(exception) - return True + result = False + return result def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): '''Copy a local file to hdfs directory''' @@ -52,7 +55,8 @@ def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): else: return False try: - return hdfsClient.copy_from_local(localFilePath, hdfsFilePath) + hdfsClient.copy_from_local(localFilePath, hdfsFilePath) + return True except Exception as exception: print(exception) return False \ No newline at end of file From 2529376f92f5e9886be5eba0e396569af0ebdce7 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Tue, 25 Sep 2018 15:20:27 +0800 Subject: [PATCH 30/60] Add experiment id in update metrics url to differ trials --- src/nni_manager/training_service/pai/paiData.ts | 4 ++-- src/nni_manager/training_service/pai/paiJobRestServer.ts | 9 ++++++--- .../training_service/pai/paiTrainingService.ts | 1 + .../training_service_tool/trial/metrics_reader.py | 3 ++- src/nni_manager/training_service_tool/trial/url_utils.py | 4 ++-- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 2c8bb6b868..e05c9df3ec 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -47,6 +47,6 @@ export class PAITrialJobDetail implements TrialJobDetail { export const PAI_TRIAL_COMMAND_FORMAT: string = `pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai -&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} +&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && mkdir .nni -&& python3 -m trial.trial_keeper --trial_command '{2}' 
--nnimanager_ip '{3}'`; \ No newline at end of file +&& python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}'`; \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts index 0d521e53c0..098ea74333 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -22,9 +22,10 @@ import { Request, Response, Router } from 'express'; import * as bodyParser from 'body-parser'; import * as component from '../../common/component'; -import { RestServer } from '../../common/restServer' +import { getExperimentId } from '../../common/experimentStartupInfo'; import { Inject } from 'typescript-ioc'; import { PAITrainingService } from './paiTrainingService'; +import { RestServer } from '../../common/restServer' /** * PAI Training service Rest server, provides rest API to support pai job metrics update @@ -37,6 +38,8 @@ export class PAIJobRestServer extends RestServer{ private readonly API_ROOT_URL: string = '/api/v1/nni-pai'; + private readonly expId: string = getExperimentId(); + @Inject private readonly paiTrainingService : PAITrainingService; @@ -67,9 +70,9 @@ export class PAIJobRestServer extends RestServer{ next(); }); - router.post('/update-metrics/:id', (req: Request, res: Response) => { + router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => { try { - this.log.info(`Get update-metrics request, trial job id is ${req.params.id}`); + this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`); this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`); // Split metrics array into single metric, then emit diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 1764f3828a..efaa9a7c14 100644 --- 
a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -167,6 +167,7 @@ class PAITrainingService implements TrainingService { // PAI will copy job's codeDir into /root directory `/root/${trialJobId}`, trialJobId, + this.experimentId, this.paiTrialConfig.command, getIPV4Address() ).replace(/\r\n|\n|\r/gm, ''); diff --git a/src/nni_manager/training_service_tool/trial/metrics_reader.py b/src/nni_manager/training_service_tool/trial/metrics_reader.py index 1d347b5548..6178e657a3 100644 --- a/src/nni_manager/training_service_tool/trial/metrics_reader.py +++ b/src/nni_manager/training_service_tool/trial/metrics_reader.py @@ -30,6 +30,7 @@ NNI_SYS_DIR = os.environ['NNI_SYS_DIR'] NNI_TRIAL_JOB_ID = os.environ['NNI_TRIAL_JOB_ID'] +NNI_EXP_ID = os.environ['NNI_EXP_ID'] LEN_FIELD_SIZE = 6 MAGIC = 'ME' @@ -114,7 +115,7 @@ def read_experiment_metrics(nnimanager_ip): result['metrics'] = reader.read_trial_metrics() print('Result metrics is {}'.format(json.dumps(result))) if len(result['metrics']) > 0: - response = rest_post(gen_update_metrics_url(BASE_URL.format(nnimanager_ip), DEFAULT_REST_PORT, NNI_TRIAL_JOB_ID), json.dumps(result), 10) + response = rest_post(gen_update_metrics_url(BASE_URL.format(nnimanager_ip), DEFAULT_REST_PORT, NNI_EXP_ID, NNI_TRIAL_JOB_ID), json.dumps(result), 10) print('Response code is {}'.format(response.status_code)) except Exception: #TODO error logging to file diff --git a/src/nni_manager/training_service_tool/trial/url_utils.py b/src/nni_manager/training_service_tool/trial/url_utils.py index de5b424f07..69ce14ecb2 100644 --- a/src/nni_manager/training_service_tool/trial/url_utils.py +++ b/src/nni_manager/training_service_tool/trial/url_utils.py @@ -20,6 +20,6 @@ from .constants import API_ROOT_URL, UPDATE_METRICS_API -def gen_update_metrics_url(base_url, port, trial_job_id): +def gen_update_metrics_url(base_url, port, exp_id, trial_job_id): '''Generate update trial metrics 
url''' - return '{0}:{1}{2}{3}/:{4}'.format(base_url, port, API_ROOT_URL, UPDATE_METRICS_API, trial_job_id) \ No newline at end of file + return '{0}:{1}{2}{3}/{4}/:{5}'.format(base_url, port, API_ROOT_URL, UPDATE_METRICS_API, exp_id, trial_job_id) \ No newline at end of file From 0f7d40c4da7a796a39c56656e4cae6b128d6bdd7 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 15:54:36 +0800 Subject: [PATCH 31/60] add default outputdir --- src/nni_manager/training_service/pai/paiConfig.ts | 2 +- src/nni_manager/training_service/pai/paiData.ts | 5 ++++- .../training_service/pai/paiTrainingService.ts | 9 ++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index 782e0790f1..aa84021ec4 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -110,7 +110,7 @@ export class NNIPAITrialConfig extends TrialConfig{ public readonly memoryMB: number; public readonly image: string; public readonly dataDir: string; - public readonly outputDir: string; + public outputDir: string; constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) { super(command, codeDir, gpuNum); diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index d1c36a09ec..8b7f2e8c20 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -49,4 +49,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = `pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai-t-shya2 && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} NNI_HDFS_OUTPUT_DIR={2} NNI_USER_NAME={3} && cd $NNI_SYS_DIR && mkdir .nni -&& python3 -m trial.trial_keeper --trial_command '{4}' 
--nnimanager_ip '{5}'`; \ No newline at end of file +&& python3 -m trial.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}'`; + +export const PAI_OUTPUT_DIR_FORMAT: string = +`hdfs://{0}:9000/{1}`; \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index fbf332fb22..8defc9ac5c 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -38,7 +38,7 @@ import { import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { ObservableTimer } from '../../common/observableTimer'; import { PAIJobRestServer } from './paiJobRestServer' -import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiData'; +import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT } from './paiData'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { String } from 'typescript-string-operations'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; @@ -303,6 +303,13 @@ class PAITrainingService implements TrainingService { break; } this.paiTrialConfig = JSON.parse(value); + if(this.paiTrialConfig.outputDir === undefined){ + this.paiTrialConfig.outputDir = String.Format( + PAI_OUTPUT_DIR_FORMAT, + this.paiClusterConfig.host, + this.paiClusterConfig.userName + ).replace(/\r\n|\n|\r/gm, ''); + } console.log(`Set Cluster metadata: paiTrialConfig is ${JSON.stringify(this.paiTrialConfig)}`); deferred.resolve(); break; From 9c53f47791bba5a31aefbf39407b6844242ee73f Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 17:11:31 +0800 Subject: [PATCH 32/60] fix trial_keeper --- .../trial/trial_keeper.py | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py 
b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 3ad00a08f2..a536097cc9 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -55,25 +55,13 @@ def main_loop(args): print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) #copy local directory to hdfs local_directory = os.environ['NNI_OUTPUT_DIR'] - hdfs_output_dir = os.environ['NNI_HDFS_OUTPUT_DIR'] + hdfs_host = os.environ['NNI_HDFS_HOST'] nni_user_name = os.environ['NNI_USER_NAME'] trial_job_id = os.environ['NNI_TRIAL_JOB_ID'] exp_id = os.environ['NNI_EXP_ID'] - #get hdfs_host and hdfs_directory - hdfs_host_pattern = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' - hdfs_host = re.findall(hdfs_host_pattern, hdfs_output_dir) - hdfs_directory = hdfs_output_dir.replace(hdfs_host[0], '') - #get url_host - url_host_pattern = '[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' - url_host = re.findall(url_host_pattern, hdfs_host[0]) - #init hdfs client - if not os.path.isdir(local_directory): - raise Exception('Local Directory Error!') - #get local folder name - hdfs_output_dir_full = os.path.join(hdfs_directory, exp_id, trial_job_id) - hdfs_client = HdfsClient(hosts='{0}:{1}'.format(url_host[0], '50070'), user_name=nni_user_name) - print(local_directory, hdfs_output_dir_full) - if copyDirectoryToHdfs(local_directory, hdfs_output_dir_full, hdfs_client): + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(hdfs_host, '50070'), user_name=nni_user_name) + print(local_directory, args.pai_hdfs_output_dir) + if copyDirectoryToHdfs(local_directory, args.pai_hdfs_output_dir, hdfs_client): print('copy directory success!') else: print('copy directory failed!') From beac29c0e0f3ff36c25af8a5dc8d83d75e68c042 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 17:21:37 +0800 Subject: [PATCH 33/60] fix bug --- src/nni_manager/training_service/pai/paiData.ts | 2 +- 
src/nni_manager/training_service/pai/paiTrainingService.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 1973208708..8e6e7b1c1f 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -50,7 +50,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = `pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai-t-shya2 && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} NNI_HDFS_HOST={3} NNI_USER_NAME={4} && cd $NNI_SYS_DIR && mkdir .nni -&& python3 -m trial.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --`; +&& python3 -m trial.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --pai_hdfs_output_dir '{7}'`; export const PAI_OUTPUT_DIR_FORMAT: string = `hdfs://{0}:9000/{1}`; diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 44ab4aea34..c2ef28bd18 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -185,8 +185,8 @@ class PAITrainingService implements TrainingService { `/root/${trialJobId}`, trialJobId, this.experimentId, - this.paiTrialConfig.outputDir, hdfsHost[0], + this.paiClusterConfig.userName, this.paiTrialConfig.command, getIPV4Address(), hdfsOutputDir From c54ad7a1a1ee40bfa63d7919a99e242a2beda8a2 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 17:31:45 +0800 Subject: [PATCH 34/60] add default value for nnioutputdir --- src/nni_manager/training_service/pai/paiTrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index c2ef28bd18..f0b650a754 100644 --- 
a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -322,7 +322,7 @@ class PAITrainingService implements TrainingService { break; } this.paiTrialConfig = JSON.parse(value); - if(this.paiTrialConfig.outputDir === undefined){ + if(this.paiTrialConfig.outputDir === undefined || this.paiTrialConfig.outputDir === null){ this.paiTrialConfig.outputDir = String.Format( PAI_OUTPUT_DIR_FORMAT, this.paiClusterConfig.host, From c214362df97bf3b04a6ca0297fa7fc7cbcc2e507 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 17:38:40 +0800 Subject: [PATCH 35/60] fix bug --- tools/nnicmd/config_schema.py | 5 +---- tools/nnicmd/launcher.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tools/nnicmd/config_schema.py b/tools/nnicmd/config_schema.py index 86164f6a88..ba1879923b 100644 --- a/tools/nnicmd/config_schema.py +++ b/tools/nnicmd/config_schema.py @@ -41,10 +41,7 @@ 'codeDir': os.path.exists, 'classFileName': str, 'className': str, - 'classArgs': { - 'optimize_mode': Or('maximize', 'minimize'), - Optional('speed'): int - }, + Optional('classArgs'): dict, Optional('gpuNum'): And(int, lambda x: 0 <= x <= 99999), }), 'trial':{ diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index 5787627a6d..0f94b597c2 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -146,11 +146,16 @@ def set_experiment(experiment_config, mode, port): value_dict['command'] = experiment_config['trial']['command'] value_dict['codeDir'] = experiment_config['trial']['codeDir'] value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] - value_dict['cpuNum'] = experiment_config['trial']['cpuNum'] - value_dict['memoryMB'] = experiment_config['trial']['memoryMB'] - value_dict['image'] = experiment_config['trial']['image'] - value_dict['dataDir'] = experiment_config['trial']['dataDir'] - value_dict['outputDir'] = 
experiment_config['trial']['outputDir'] + if experiment_config['trial'].get('cpuNum'): + value_dict['cpuNum'] = experiment_config['trial']['cpuNum'] + if experiment_config['trial'].get('memoryMB'): + value_dict['memoryMB'] = experiment_config['trial']['memoryMB'] + if experiment_config['trial'].get('image'): + value_dict['image'] = experiment_config['trial']['image'] + if experiment_config['trial'].get('dataDir'): + value_dict['dataDir'] = experiment_config['trial']['dataDir'] + if experiment_config['trial'].get('outputDir'): + value_dict['outputDir'] = experiment_config['trial']['outputDir'] request_data['clusterMetaData'].append( {'key': 'trial_config', 'value': value_dict}) From 60bf77050cf02c8ea3df03a22f41c0ffa7959202 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 17:55:12 +0800 Subject: [PATCH 36/60] remove unused code --- tools/nnicmd/webui_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/nnicmd/webui_utils.py b/tools/nnicmd/webui_utils.py index 061a0fdbaf..1121452a08 100644 --- a/tools/nnicmd/webui_utils.py +++ b/tools/nnicmd/webui_utils.py @@ -31,17 +31,10 @@ def start_web_ui(port): '''start web ui''' serve = os.environ.get('NNI_SERVE', 'serve') - print(serve) web_ui = os.environ.get('WEB_UI_FOLDER') - print(web_ui) cmds = [serve, '-s', '-n', web_ui, '-l', str(port)] - print(cmds) stdout_file = open(STDOUT_FULL_PATH, 'a+') stderr_file = open(STDERR_FULL_PATH, 'a+') - print(cmds) - if not serve or not web_ui: - print_error('Failed to start webui!') - return None webui_process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) if webui_process.returncode is None: webui_url_list = [] From fad2ba359d74a8fcac6b8d1e34571b7206a3b50d Mon Sep 17 00:00:00 2001 From: fishyds Date: Tue, 25 Sep 2018 18:06:00 +0800 Subject: [PATCH 37/60] PAI Training service implementation, v1 (#1) * PAI Training service implementation, v1 * update trial package directory in setup.py * Update setup.py package info * Update trial 
keeper module, use IP adress for pai training service machine * Update metrics file path in reader * Fix metrics file path issue * Update pai integration, full implementation of pai training service * Do not send metrics if it is empty * Update nnictl, to support pai configuration * split metrics into single line, and read metrics no matter if subprocess is already quitted * Add experiment id in update metrics url to differ trials --- Makefile | 18 +- setup.py | 5 +- src/nni_manager/common/utils.ts | 18 +- src/nni_manager/main.ts | 7 +- src/nni_manager/package.json | 4 +- .../rest_server/restValidationSchemas.ts | 14 +- .../training_service/common/jobMetrics.ts | 37 ++ .../common/trialConfigMetadataKey.ts | 3 +- .../training_service/pai/hdfsClientUtility.ts | 200 ++++++++++ .../training_service/pai/paiConfig.ts | 123 +++++++ .../training_service/pai/paiData.ts | 52 +++ .../pai/paiJobInfoCollector.ts | 136 +++++++ .../training_service/pai/paiJobRestServer.ts | 80 +++- .../pai/paiTrainingService.ts | 345 ++++++++++++++++++ .../training_service/pai/paiTrialConfig.ts | 39 ++ .../remote_machine/metricsCollector.ts | 3 +- .../remote_machine/remoteMachineData.ts | 19 +- .../remoteMachineTrainingService.ts | 8 +- .../test/hdfsClientUtility.test.ts | 143 ++++++++ .../test/paiTrainingService.test.ts | 95 +++++ .../training_service_tool/setup.py | 20 + .../training_service_tool/trial/__init__.py | 0 .../training_service_tool/trial/constants.py | 37 ++ .../trial/metrics_reader.py | 124 +++++++ .../training_service_tool/trial/rest_utils.py | 57 +++ .../trial/trial_keeper.py | 77 ++++ .../training_service_tool/trial/url_utils.py | 25 ++ src/nni_manager/yarn.lock | 116 +++++- src/sdk/pynni/nni/platform/__init__.py | 2 +- tools/nnicmd/config_schema.py | 42 +-- tools/nnicmd/launcher.py | 55 ++- 31 files changed, 1824 insertions(+), 80 deletions(-) create mode 100644 src/nni_manager/training_service/common/jobMetrics.ts create mode 100644 
src/nni_manager/training_service/pai/hdfsClientUtility.ts create mode 100644 src/nni_manager/training_service/pai/paiConfig.ts create mode 100644 src/nni_manager/training_service/pai/paiData.ts create mode 100644 src/nni_manager/training_service/pai/paiJobInfoCollector.ts create mode 100644 src/nni_manager/training_service/pai/paiTrainingService.ts create mode 100644 src/nni_manager/training_service/pai/paiTrialConfig.ts create mode 100644 src/nni_manager/training_service/test/hdfsClientUtility.test.ts create mode 100644 src/nni_manager/training_service/test/paiTrainingService.test.ts create mode 100644 src/nni_manager/training_service_tool/setup.py create mode 100644 src/nni_manager/training_service_tool/trial/__init__.py create mode 100644 src/nni_manager/training_service_tool/trial/constants.py create mode 100644 src/nni_manager/training_service_tool/trial/metrics_reader.py create mode 100644 src/nni_manager/training_service_tool/trial/rest_utils.py create mode 100644 src/nni_manager/training_service_tool/trial/trial_keeper.py create mode 100644 src/nni_manager/training_service_tool/trial/url_utils.py diff --git a/Makefile b/Makefile index f429fc20bb..1604fb1937 100644 --- a/Makefile +++ b/Makefile @@ -90,14 +90,9 @@ build: #$(_INFO) Building nnictl $(_END) cd tools && python3 setup.py build + #$(_INFO) Building Training Service tool $(_END) + cd src/nni_manager/training_service_tool && python3 setup.py build -# Standard installation target -# Must be invoked after building -.PHONY: install -install: install-python-modules -install: install-node-modules -install: install-scripts -install: install-examples install: #$(_INFO) Complete! 
You may want to add $(BIN_PATH) to your PATH environment $(_END) @@ -107,6 +102,7 @@ install: .PHONY: remote-machine-install remote-machine-install: cd src/sdk/pynni && python3 setup.py install $(PIP_MODE) + cd src/nni_manager/training_service_tool && python3 setup.py install $(PIP_MODE) # All-in-one target for non-expert users @@ -145,6 +141,7 @@ dev-install: uninstall: -pip3 uninstall -y nni -pip3 uninstall -y nnictl + -pip3 uninstall -y nnits-tool -rm -rf $(INSTALL_PREFIX)/nni -rm -f $(BIN_PATH)/nnimanager -rm -f $(BIN_PATH)/nnictl @@ -203,6 +200,8 @@ install-python-modules: #$(_INFO) Installing nnictl $(_END) cd tools && python3 setup.py install $(PIP_MODE) + #$(_INFO) Installing NNI training service tool $(_END) + cd src/nni_manager/training_service_tool && python3 setup.py install $(PIP_MODE) .PHONY: install-node-modules install-node-modules: @@ -223,7 +222,10 @@ install-dev-modules: #$(_INFO) Installing nnictl $(_END) cd tools && pip3 install $(PIP_MODE) -e . - + + #$(_INFO) Installing NNI training service tool $(_END) + cd src/nni_manager/training_service_tool && pip3 install $(PIP_MODE) -e . 
+ mkdir -p $(INSTALL_PREFIX)/nni #$(_INFO) Installing NNI Manager $(_END) diff --git a/setup.py b/setup.py index eeee54d075..6e0782ccdf 100644 --- a/setup.py +++ b/setup.py @@ -65,11 +65,12 @@ def run(self): license = 'MIT', url = 'https://github.com/Microsoft/nni', - packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools'), + packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools') + find_packages('src/nni_manager/training_service_tool'), package_dir = { 'nni_annotation': 'tools/nni_annotation', 'nni': 'src/sdk/pynni/nni', - 'nnicmd': 'tools/nnicmd' + 'nnicmd': 'tools/nnicmd', + 'trial': 'src/nni_manager/training_service_tool/trial' }, python_requires = '>=3.5', install_requires = [ diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index ba0650ef28..805d3ac4b0 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -229,5 +229,19 @@ function cleanupUnitTest(): void { Container.restore(ExperimentStartupInfo); } -export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, mkDirP, delay, prepareUnitTest, - parseArg, cleanupUnitTest, uniqueString, randomSelect }; +/** + * Get IPv4 address of current machine + */ +function getIPV4Address(): string { + let ipv4Address : string = ''; + + for(const item of os.networkInterfaces().eth0) { + if(item.family === 'IPv4') { + ipv4Address = item.address; + } + } + return ipv4Address; +} + +export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, getIPV4Address, + mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect }; diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 6d9c9fa64b..f3d386eccd 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -36,6 +36,7 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini import { RemoteMachineTrainingService } from 
'./training_service/remote_machine/remoteMachineTrainingService'; +import { PAITrainingService } from './training_service/pai/paiTrainingService' function initStartupInfo(startExpMode: string, resumeExperimentId: string) { @@ -49,6 +50,8 @@ async function initContainer(platformMode: string): Promise { Container.bind(TrainingService).to(LocalTrainingServiceForGPU).scope(Scope.Singleton); } else if (platformMode === 'remote') { Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton); + } else if (platformMode === 'pai'){ + Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton); } else { throw new Error(`Error: unsupported mode: ${mode}`); } @@ -61,7 +64,7 @@ async function initContainer(platformMode: string): Promise { } function usage(): void { - console.info('usage: node main.js --port --mode --start_mode --experiment_id '); + console.info('usage: node main.js --port --mode --start_mode --experiment_id '); } let port: number = NNIRestServer.DEFAULT_PORT; @@ -71,7 +74,7 @@ if (strPort && strPort.length > 0) { } const mode: string = parseArg(['--mode', '-m']); -if (!['local', 'remote'].includes(mode)) { +if (!['local', 'remote', 'pai'].includes(mode)) { usage(); process.exit(1); } diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index 46522044fd..04ee4df3c2 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -23,7 +23,8 @@ "tree-kill": "^1.2.0", "ts-deferred": "^1.0.4", "typescript-ioc": "^1.2.4", - "typescript-string-operations": "^1.3.1" + "typescript-string-operations": "^1.3.1", + "webhdfs":"^1.2.0" }, "devDependencies": { "@types/chai": "^4.1.4", @@ -40,6 +41,7 @@ "chai": "^4.1.2", "mocha": "^5.2.0", "request": "^2.87.0", + "rmdir": "^1.2.0", "tmp": "^0.0.33", "ts-node": "^7.0.0", "tslint": "^5.11.0", diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 218a8c22c4..22f1acb222 100644 
--- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -33,9 +33,19 @@ export namespace ValidationSchemas { passphrase: joi.string() })), trial_config: joi.object({ - gpuNum: joi.number().min(0).required(), + image: joi.string().min(1), codeDir: joi.string().min(1).required(), - command: joi.string().min(1).required() + dataDir: joi.string(), + outputDir: joi.string(), + cpuNum: joi.number().min(1), + memoryMB: joi.number().min(100), + gpuNum: joi.number().min(0).required(), + command: joi.string().min(1).required() + }), + pai_config: joi.object({ + userName: joi.string().min(1).required(), + passWord: joi.string().min(1).required(), + host: joi.string().min(1).required() }) } }; diff --git a/src/nni_manager/training_service/common/jobMetrics.ts b/src/nni_manager/training_service/common/jobMetrics.ts new file mode 100644 index 0000000000..90228ffa7d --- /dev/null +++ b/src/nni_manager/training_service/common/jobMetrics.ts @@ -0,0 +1,37 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +import { TrialJobStatus } from '../../common/trainingService'; + +// tslint:disable-next-line:max-classes-per-file +export class JobMetrics { + public readonly jobId: string; + public readonly metrics: string[]; + public readonly jobStatus: TrialJobStatus; + public readonly endTimestamp: number; + + constructor(jobId : string, metrics : string[], jobStatus : TrialJobStatus, endTimestamp : number) { + this.jobId = jobId; + this.metrics = metrics; + this.jobStatus = jobStatus; + this.endTimestamp = endTimestamp; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts index e9749e562e..12df449ee1 100644 --- a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts +++ b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts @@ -26,5 +26,6 @@ export enum TrialConfigMetadataKey { MACHINE_LIST = 'machine_list', TRIAL_CONFIG = 'trial_config', EXPERIMENT_ID = 'experimentId', - RANDOM_SCHEDULER = 'random_scheduler' + RANDOM_SCHEDULER = 'random_scheduler', + PAI_CLUSTER_CONFIG = 'pai_config' } diff --git a/src/nni_manager/training_service/pai/hdfsClientUtility.ts b/src/nni_manager/training_service/pai/hdfsClientUtility.ts new file mode 100644 index 0000000000..07dcc2a744 --- /dev/null +++ b/src/nni_manager/training_service/pai/hdfsClientUtility.ts @@ -0,0 +1,200 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +import * as path from 'path'; +import * as fs from 'fs'; +import { Deferred } from 'ts-deferred'; +import { getLogger } from '../../common/log'; + +/** + * HDFS client utility, including copy file/directory + */ +export namespace HDFSClientUtility { + /** + * Copy a local file to hdfs directory + * + * @param localFilePath local file path(source) + * @param hdfsFilePath hdfs file path(target) + * @param hdfsClient hdfs client + */ + export async function copyFileToHdfs(localFilePath : string, hdfsFilePath : string, hdfsClient : any) : Promise { + const deferred: Deferred = new Deferred(); + fs.exists(localFilePath, (exists : boolean) => { + // Detect if local file exist + if (exists) { + var localFileStream = fs.createReadStream(localFilePath); + var hdfsFileStream = hdfsClient.createWriteStream(hdfsFilePath); + localFileStream.pipe(hdfsFileStream); + hdfsFileStream.on('finish', function onFinish () { + deferred.resolve(); + }); + hdfsFileStream.on('error', (err : any) => { + getLogger().error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`); + deferred.reject(err); + }); + } else { + getLogger().error(`HDFSCientUtility:copyFileToHdfs, ${localFilePath} doesn't exist locally`); + deferred.reject('file not exist!'); + } + }); + return deferred.promise; + } + + /** + * Recursively copy local directory to hdfs directory + * + * @param localDirectory local directory + * @param hdfsDirectory HDFS directory + * @param hdfsClient HDFS client + */ + export async function copyDirectoryToHdfs(localDirectory : string, hdfsDirectory : string, hdfsClient : any) : Promise{ + const deferred: Deferred = new Deferred(); + // TODO: fs.readdirSync doesn't support ~($HOME) + const fileNameArray: string[] = fs.readdirSync(localDirectory); + + for(var fileName of fileNameArray){ + const fullFilePath: string = path.join(localDirectory, fileName); + try { + if (fs.lstatSync(fullFilePath).isFile()) { + await copyFileToHdfs(fullFilePath, 
path.join(hdfsDirectory, fileName), hdfsClient); + } else { + // If filePath is a directory, recuisively copy it to remote directory + await copyDirectoryToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient); + } + } catch(error) { + deferred.reject(error); + } + } + // All files/directories are copied successfully, resolve + deferred.resolve(); + + return deferred.promise; + } + + /** + * Read content from HDFS file + * + * @param hdfsPath HDFS file path + * @param hdfsClient HDFS client + */ + export async function readFileFromHDFS(hdfsPath : string, hdfsClient :any) : Promise { + const deferred: Deferred = new Deferred(); + let buffer : Buffer = Buffer.alloc(0); + + const exist : boolean = await pathExists(hdfsPath, hdfsClient); + if(!exist) { + deferred.reject(`${hdfsPath} doesn't exists`); + } + + const remoteFileStream = hdfsClient.createReadStream(hdfsPath); + remoteFileStream.on('error', (err : any) => { + // Reject with the error + deferred.reject(err); + }); + + remoteFileStream.on('data', (chunk : any) => { + // Concat the data chunk to buffer + buffer = Buffer.concat([buffer, chunk]); + }); + + remoteFileStream.on('finish', function onFinish () { + // Upload is done, resolve + deferred.resolve(buffer); + }); + + return deferred.promise; + } + + /** + * Check if an HDFS path already exists + * + * @param hdfsPath target path need to check in HDFS + * @param hdfsClient HDFS client + */ + export async function pathExists(hdfsPath : string, hdfsClient : any) : Promise { + const deferred : Deferred = new Deferred(); + hdfsClient.exists(hdfsPath, (exist : boolean ) => { + deferred.resolve(exist); + }) + + return deferred.promise; + } + + /** + * Mkdir in HDFS, use default permission 755 + * + * @param hdfsPath the path in HDFS. 
It could be either file or directory + * @param hdfsClient + */ + export function mkdir(hdfsPath : string, hdfsClient : any) : Promise { + const deferred : Deferred = new Deferred(); + + hdfsClient.mkdir(hdfsPath, (err : any)=> { + if(!err) { + deferred.resolve(true); + } else { + deferred.reject(err.message); + } + }); + + return deferred.promise; + } + + /** + * Read directory contents + * + * @param hdfsPath the path in HDFS. It could be either file or directory + * @param hdfsClient + */ + export async function readdir(hdfsPath : string, hdfsClient : any) : Promise { + const deferred : Deferred = new Deferred(); + const exist : boolean = await pathExists(hdfsPath, hdfsClient); + if(!exist) { + deferred.reject(`${hdfsPath} doesn't exists`); + } + + hdfsClient.readdir(hdfsPath, (err : any, files : any[] ) => { + if(err) { + deferred.reject(err); + } + + deferred.resolve(files); + }); + + return deferred.promise; + } + + /** + * Delete HDFS path + * @param hdfsPath the path in HDFS. It could be either file or directory + * @param hdfsClient + * @param recursive Mark if need to delete recursively + */ + export function deletePath(hdfsPath : string, hdfsClient : any, recursive : boolean = true) : Promise { + const deferred : Deferred = new Deferred(); + hdfsClient.unlink(hdfsPath, recursive, (err : any)=> { + if(!err) { + deferred.resolve(true); + } else { + deferred.reject(err.message); + } + }); + return deferred.promise; + } +} diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts new file mode 100644 index 0000000000..782e0790f1 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -0,0 +1,123 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import {TrialConfig} from '../common/trialConfig' + +export class PAITaskRole { + // Name for the task role + public readonly name: string; + // Number of tasks for the task role, no less than 1 + public readonly taskNumber: number; + // CPU number for one task in the task role, no less than 1 + public readonly cpuNumber: number; + // Memory for one task in the task role, no less than 100 + public readonly memoryMB: number; + // GPU number for one task in the task role, no less than 0 + public readonly gpuNumber: number; + // Executable command for tasks in the task role, can not be empty + public readonly command: string; + + /** + * Constructor + * @param name Name for the task role + * @param taskNumber Number of tasks for the task role, no less than 1 + * @param cpuNumber CPU number for one task in the task role, no less than 1 + * @param memoryMB Memory for one task in the task role, no less than 100 + * @param gpuNumber GPU number for one task in the task role, no less than 0 + * @param command Executable command for tasks in the task role, can not be empty + */ + constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, command : string) { + this.name = name; + this.taskNumber = taskNumber; + this.cpuNumber = cpuNumber; + this.memoryMB = memoryMB; + this.gpuNumber = gpuNumber; + this.command = command; + } +} + +export class PAIJobConfig{ + // Name for the job, need to be unique + public readonly jobName: string; + // URL pointing to the Docker image for all tasks in the job + public readonly image: string; + // Data directory existing on HDFS + public readonly dataDir: string; + // Output directory on HDFS + public readonly outputDir: string; + // Code directory on HDFS + public readonly codeDir: string; + + // List of taskRole, one task role at least + public taskRoles: PAITaskRole[]; + + /** + * Constructor + * @param jobName Name for the job, need to be unique + * @param image URL 
pointing to the Docker image for all tasks in the job + * @param dataDir Data directory existing on HDFS + * @param outputDir Output directory on HDFS + * @param taskRoles List of taskRole, one task role at least + */ + constructor(jobName: string, image : string, dataDir : string, outputDir : string, codeDir : string, taskRoles : PAITaskRole[]){ + this.jobName = jobName; + this.image = image; + this.dataDir = dataDir; + this.outputDir = outputDir; + this.codeDir = codeDir; + this.taskRoles = taskRoles; + } +} + +export class PAIClusterConfig { + public readonly userName: string; + public readonly passWord: string; + public readonly host: string; + + /** + * Constructor + * @param userName User name of PAI Cluster + * @param passWord password of PAI Cluster + * @param host Host IP of PAI Cluster + */ + constructor(userName: string, passWord : string, host : string){ + this.userName = userName; + this.passWord = passWord; + this.host = host; + } +} + +export class NNIPAITrialConfig extends TrialConfig{ + public readonly cpuNum: number; + public readonly memoryMB: number; + public readonly image: string; + public readonly dataDir: string; + public readonly outputDir: string; + + constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) { + super(command, codeDir, gpuNum); + this.cpuNum = cpuNum; + this.memoryMB = memoryMB; + this.image = image; + this.dataDir = dataDir; + this.outputDir = outputDir; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts new file mode 100644 index 0000000000..e05c9df3ec --- /dev/null +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -0,0 +1,52 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from 'common/trainingService'; + +export class PAITrialJobDetail implements TrialJobDetail { + public id: string; + public status: TrialJobStatus; + public paiJobName: string; + public submitTime: number; + public startTime?: number; + public endTime?: number; + public tags?: string[]; + public url?: string; + public workingDirectory: string; + public form: JobApplicationForm; + + constructor(id: string, status: TrialJobStatus, paiJobName : string, + submitTime: number, workingDirectory: string, form: JobApplicationForm) { + this.id = id; + this.status = status; + this.paiJobName = paiJobName; + this.submitTime = submitTime; + this.workingDirectory = workingDirectory; + this.form = form; + this.tags = []; + } +} + +export const PAI_TRIAL_COMMAND_FORMAT: string = +`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai +&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} +&& cd $NNI_SYS_DIR && mkdir .nni +&& python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}'`; \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts new file mode 100644 index 0000000000..f347205b80 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -0,0 +1,136 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as request from 'request'; +import { EventEmitter } from 'events'; +import { Deferred } from 'ts-deferred'; +import { getLogger, Logger } from '../../common/log'; +import { NNIError, NNIErrorNames } from '../../common/errors'; +import { PAITrialJobDetail } from './paiData'; +import { PAIClusterConfig } from './paiConfig'; +import { TrialJobStatus } from '../../common/trainingService'; + +/** + * Collector PAI jobs info from PAI cluster, and update pai job status locally + */ +export class PAIJobInfoCollector { + private readonly trialJobsMap : Map; + private readonly log: Logger = getLogger(); + private readonly statusesNeedToCheck : TrialJobStatus[]; + private readonly finalStatuses : TrialJobStatus[]; + + constructor(jobMap: Map) { + this.trialJobsMap = jobMap; + this.statusesNeedToCheck = ['RUNNING', 'UNKNOWN', 'WAITING']; + this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED']; + } + + public async updateTrialStatusFromPAI(paiToken? 
: string, paiClusterConfig?: PAIClusterConfig) : Promise { + if (!paiClusterConfig || !paiToken) { + return Promise.resolve(); + } + + const updatePaiTrialJobs : Promise[] = []; + for(let [trialJobId, paiTrialJob] of this.trialJobsMap) { + if (!paiTrialJob) { + throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); + } + updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig)) + } + + await Promise.all(updatePaiTrialJobs); + } + + private getSinglePAITrialJobInfo(paiTrialJob : PAITrialJobDetail, paiToken : string, paiClusterConfig: PAIClusterConfig) : Promise { + const deferred : Deferred = new Deferred(); + if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) { + deferred.resolve(); + return deferred.promise; + } + + // Rest call to get PAI job info and update status + // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API + const getJobInfoRequest: request.Options = { + uri: `http://${paiClusterConfig.host}:9186/api/v1/jobs/${paiTrialJob.paiJobName}`, + method: 'GET', + json: true, + headers: { + "Content-Type": "application/json", + "Authorization": 'Bearer ' + paiToken + } + }; + //TODO : pass in request timeout param? 
+ request(getJobInfoRequest, (error: Error, response: request.Response, body: any) => { + if (error || response.statusCode >= 500) { + this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`); + // Queried PAI job info failed, set job status to UNKNOWN + if(paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') { + paiTrialJob.status = 'UNKNOWN'; + } + } else { + if(response.body.jobStatus && response.body.jobStatus.state) { + console.log(`*****IN getSinglePAITrialJobInfo: response body state is ${response.body.jobStatus.state}`); + switch(response.body.jobStatus.state) { + case 'WAITING': + paiTrialJob.status = 'WAITING'; + break; + case 'RUNNING': + paiTrialJob.status = 'RUNNING'; + if(!paiTrialJob.startTime) { + paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + } + if(!paiTrialJob.url) { + paiTrialJob.url = response.body.jobStatus.appTrackingUrl; + } + break; + case 'SUCCEEDED': + paiTrialJob.status = 'SUCCEEDED'; + break; + case 'STOPPED': + paiTrialJob.status = 'USER_CANCELED'; + break; + case 'FAILED': + paiTrialJob.status = 'FAILED'; + break; + default: + paiTrialJob.status = 'UNKNOWN'; + break; + } + // For final job statues, update startTime, endTime and url + if(this.finalStatuses.includes(paiTrialJob.status)) { + if(!paiTrialJob.startTime) { + paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime; + } + if(!paiTrialJob.endTime) { + paiTrialJob.endTime = response.body.jobStatus.completedTime; + } + if(!paiTrialJob.url) { + paiTrialJob.url = response.body.jobStatus.appTrackingUrl; + } + } + } + } + deferred.resolve(); + }); + + return deferred.promise; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiJobRestServer.ts b/src/nni_manager/training_service/pai/paiJobRestServer.ts index 6375eee1c5..098ea74333 100644 --- a/src/nni_manager/training_service/pai/paiJobRestServer.ts +++ 
b/src/nni_manager/training_service/pai/paiJobRestServer.ts @@ -17,4 +17,82 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - \ No newline at end of file +'use strict'; + +import { Request, Response, Router } from 'express'; +import * as bodyParser from 'body-parser'; +import * as component from '../../common/component'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { Inject } from 'typescript-ioc'; +import { PAITrainingService } from './paiTrainingService'; +import { RestServer } from '../../common/restServer' + +/** + * PAI Training service Rest server, provides rest API to support pai job metrics update + * + */ +@component.Singleton +export class PAIJobRestServer extends RestServer{ + /** NNI main rest service default port */ + private static readonly DEFAULT_PORT: number = 51189; + + private readonly API_ROOT_URL: string = '/api/v1/nni-pai'; + + private readonly expId: string = getExperimentId(); + + @Inject + private readonly paiTrainingService : PAITrainingService; + + /** + * constructor to provide NNIRestServer's own rest property, e.g. 
port + */ + constructor() { + super(); + this.port = PAIJobRestServer.DEFAULT_PORT; + this.paiTrainingService = component.get(PAITrainingService); + } + + /** + * NNIRestServer's own router registration + */ + protected registerRestHandler(): void { + this.app.use(bodyParser.json()); + this.app.use(this.API_ROOT_URL, this.createRestHandler()); + } + + private createRestHandler() : Router { + const router: Router = Router(); + + // tslint:disable-next-line:typedef + router.use((req: Request, res: Response, next) => { + this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`); + res.setHeader('Content-Type', 'application/json'); + next(); + }); + + router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => { + try { + this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`); + this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`); + + // Split metrics array into single metric, then emit + // Warning: If not split metrics into single ones, the behavior will be UNKNOWN + for (const singleMetric of req.body.metrics) { + this.paiTrainingService.MetricsEmitter.emit('metric', { + id : req.body.jobId, + data : singleMetric + }); + } + + res.send(); + } + catch(err) { + this.log.error(`json parse metrics error: ${err}`); + res.status(500); + res.send(err.message); + } + }); + + return router; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts new file mode 100644 index 0000000000..efaa9a7c14 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -0,0 +1,345 @@ + +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict' + +import * as component from '../../common/component'; +import * as cpp from 'child-process-promise'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as request from 'request'; + +import { Deferred } from 'ts-deferred'; +import { EventEmitter } from 'events'; +import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common/errors'; +import { getLogger, Logger } from '../../common/log'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { + HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm, + TrialJobDetail, TrialJobMetric, TrialJobStatus +} from '../../common/trainingService'; +import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; +import { ObservableTimer } from '../../common/observableTimer'; +import { PAIJobRestServer } from './paiJobRestServer' +import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiData'; +import { PAIJobInfoCollector } from './paiJobInfoCollector'; +import { String } from 'typescript-string-operations'; +import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; +import { HDFSClientUtility } from './hdfsClientUtility' +import { getExperimentId } from '../../common/experimentStartupInfo'; + + +var WebHDFS = require('webhdfs'); + +/** + * Training Service implementation for OpenPAI (Open Platform for AI) + * Refer https://github.com/Microsoft/pai for more info about OpenPAI + */ +@component.Singleton +class PAITrainingService implements TrainingService { + private readonly log!: Logger; + private readonly metricsEmitter: EventEmitter; + private readonly trialJobsMap: Map; + private readonly expRootDir: string; + private paiTrialConfig: NNIPAITrialConfig | undefined; + private paiClusterConfig?: PAIClusterConfig; + private stopping: boolean = false; + private hdfsClient: any; + private paiToken? : string; + private experimentId! 
: string; + private readonly paiJobCollector : PAIJobInfoCollector; + + constructor() { + this.log = getLogger(); + this.metricsEmitter = new EventEmitter(); + this.trialJobsMap = new Map(); + // Root dir on HDFS + this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); + this.experimentId = getExperimentId(); + this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); + } + + public async run(): Promise { + const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + await restServer.start(); + this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); + while (!this.stopping) { + await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig); + await delay(3000); + } + } + + public async listTrialJobs(): Promise { + const jobs: TrialJobDetail[] = []; + + this.trialJobsMap.forEach(async (value: PAITrialJobDetail, key: string) => { + if (value.form.jobType === 'TRIAL') { + jobs.push(await this.getTrialJob(key)); + } + }); + + return Promise.resolve(jobs); + } + + public getTrialJob(trialJobId: string): Promise { + if(!this.paiClusterConfig) { + throw new Error('PAI Cluster config is not initialized'); + } + + const paiTrialJob: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + + if (!paiTrialJob) { + return Promise.reject(`trial job ${trialJobId} not found`) + } + + return Promise.resolve(paiTrialJob); + } + + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) { + this.metricsEmitter.on('metric', listener); + } + + public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) { + this.metricsEmitter.off('metric', listener); + } + + public async submitTrialJob(form: JobApplicationForm): Promise { + const deferred : Deferred = new Deferred(); + if(!this.paiClusterConfig) { + throw new Error('PAI Cluster config is not initialized'); + } + if (!this.paiTrialConfig) { + throw new Error('trial config is not 
initialized'); + } + if (!this.paiToken) { + throw new Error('PAI token is not initialized'); + } + + this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); + + const trialJobId: string = uniqueString(5); + //TODO: use HDFS working folder instead + const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); + + const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); + //create tmp trial working folder locally. + await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); + await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`); + + // Write file content ( parameter.cfg ) to local tmp folders + const trialForm : TrialJobApplicationForm = (form) + if(trialForm) { + await fs.promises.writeFile(path.join(trialLocalTempFolder, 'parameter.cfg'), trialForm.hyperParameters, { encoding: 'utf8' }); + } + + // Step 1. Prepare PAI job configuration + const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; + const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); + + const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( + trialJobId, + 'WAITING', + paiJobName, + Date.now(), + trialWorkingFolder, + form); + this.trialJobsMap.set(trialJobId, trialJobDetail); + + const nniPaiTrialCommand : string = String.Format( + PAI_TRIAL_COMMAND_FORMAT, + // PAI will copy job's codeDir into /root directory + `/root/${trialJobId}`, + trialJobId, + this.experimentId, + this.paiTrialConfig.command, + getIPV4Address() + ).replace(/\r\n|\n|\r/gm, ''); + + console.log(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); + const paiTaskRoles : PAITaskRole[] = [new PAITaskRole('nni_trail_' + trialJobId, + // Task role number + 1, + // Task CPU number + this.paiTrialConfig.cpuNum, + // Task memory + this.paiTrialConfig.memoryMB, + // Task GPU number + this.paiTrialConfig.gpuNum, + // Task command + nniPaiTrialCommand)]; + + const paiJobConfig : 
PAIJobConfig = new PAIJobConfig( + // Job name + paiJobName, + // Docker image + this.paiTrialConfig.image, + // dataDir + this.paiTrialConfig.dataDir, + // outputDir + this.paiTrialConfig.outputDir, + // codeDir + `$PAI_DEFAULT_FS_URI${hdfsCodeDir}`, + // TODO: Add Virutal Cluster + // PAI Task roles + paiTaskRoles); + console.log(`PAI job config is ${JSON.stringify(paiJobConfig)}`); + console.log(`Before submission, trial job detail is ${JSON.stringify(trialJobDetail)}`); + + // Step 2. Upload code files in codeDir onto HDFS + try { + await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient); + } catch (error) { + this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`); + throw new Error(error.message); + } + + // Step 3. Submit PAI job via Rest call + // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API + const submitJobRequest: request.Options = { + uri: `http://${this.paiClusterConfig.host}:9186/api/v1/jobs`, + method: 'POST', + json: true, + body: paiJobConfig, + headers: { + "Content-Type": "application/json", + "Authorization": 'Bearer ' + this.paiToken + } + }; + request(submitJobRequest, (error: Error, response: request.Response, body: any) => { + console.log(`After submission, trial job detail is ${JSON.stringify(trialJobDetail)}`); + if (error || response.statusCode >= 400) { + this.log.error(`PAI Training service: Submit trial ${trialJobId} to PAI Cluster failed!`); + trialJobDetail.status = 'FAILED'; + deferred.reject(error ? 
error.message : 'Submit trial failed, http code: ' + response.statusCode); + } else { + trialJobDetail.submitTime = Date.now(); + deferred.resolve(trialJobDetail); + } + }); + + return deferred.promise; + } + + public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise { + throw new MethodNotImplementedError(); + } + + public get isMultiPhaseJobSupported(): boolean { + return false; + } + + public cancelTrialJob(trialJobId: string): Promise { + this.log.info(`PAI Training service cancelTrialJob: jobId: ${trialJobId}`); + const deferred : Deferred = new Deferred(); + + deferred.resolve(); + return deferred.promise; + } + + public setClusterMetadata(key: string, value: string): Promise { + const deferred : Deferred = new Deferred(); + + switch (key) { + case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: + //TODO: try catch exception when setting up HDFS client and get PAI token + this.paiClusterConfig = JSON.parse(value); + + this.hdfsClient = WebHDFS.createClient({ + user: this.paiClusterConfig.userName, + port: 50070, + host: this.paiClusterConfig.host + }); + + // Get PAI authentication token + const authentication_req: request.Options = { + uri: `http://${this.paiClusterConfig.host}:9186/api/v1/token`, + method: 'POST', + json: true, + body: { + username: this.paiClusterConfig.userName, + password: this.paiClusterConfig.passWord + } + }; + + request(authentication_req, (error: Error, response: request.Response, body: any) => { + if (error) { + //TODO: should me make the setClusterMetadata's return type to Promise? 
+ this.log.error(`Get PAI token failed: ${error.message}`); + deferred.reject(); + } else { + if(response.statusCode !== 200){ + this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`); + deferred.reject(); + } + this.paiToken = body.token; + + console.log(`Got token ${this.paiToken} from PAI Cluster`); + deferred.resolve(); + } + }); + break; + case TrialConfigMetadataKey.TRIAL_CONFIG: + if (!this.paiClusterConfig){ + this.log.error('pai cluster config is not initialized'); + deferred.reject(); + break; + } + this.paiTrialConfig = JSON.parse(value); + console.log(`Set Cluster metadata: paiTrialConfig is ${JSON.stringify(this.paiTrialConfig)}`); + deferred.resolve(); + break; + default: + //Reject for unknown keys + throw new Error(`Uknown key: ${key}`); + } + + return deferred.promise; + } + + public getClusterMetadata(key: string): Promise { + const deferred : Deferred = new Deferred(); + + deferred.resolve(); + return deferred.promise; + } + + public async cleanUp(): Promise { + this.stopping = true; + + const deferred : Deferred = new Deferred(); + const restServer: PAIJobRestServer = component.get(PAIJobRestServer); + try { + await restServer.stop(); + deferred.resolve(); + this.log.info('PAI Training service rest server stopped successfully.'); + } catch (error) { + this.log.error(`PAI Training service rest server stopped failed, error: ${error.message}`); + deferred.reject(error); + } + + return deferred.promise; + } + + public get MetricsEmitter() : EventEmitter { + return this.metricsEmitter; + } +} + +export { PAITrainingService } \ No newline at end of file diff --git a/src/nni_manager/training_service/pai/paiTrialConfig.ts b/src/nni_manager/training_service/pai/paiTrialConfig.ts new file mode 100644 index 0000000000..583db9e725 --- /dev/null +++ b/src/nni_manager/training_service/pai/paiTrialConfig.ts @@ -0,0 +1,39 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import {TrialConfig} from '../common/trialConfig' + +export class PAITrialConfig extends TrialConfig{ + public readonly cpuNum: number; + public readonly memoryMB: number; + public readonly image: string; + public readonly dataDir: string; + public readonly outputDir: string; + + constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) { + super(command, codeDir, gpuNum); + this.cpuNum = cpuNum; + this.memoryMB = memoryMB; + this.image = image; + this.dataDir = dataDir; + this.outputDir = outputDir; + } +} \ No newline at end of file diff --git a/src/nni_manager/training_service/remote_machine/metricsCollector.ts b/src/nni_manager/training_service/remote_machine/metricsCollector.ts index 3e346e7000..eb59a51d99 100644 --- a/src/nni_manager/training_service/remote_machine/metricsCollector.ts +++ b/src/nni_manager/training_service/remote_machine/metricsCollector.ts @@ -25,7 +25,8 @@ import * as path from 'path'; import { Client } from 'ssh2'; import { getLogger, Logger } from '../../common/log'; import { TrialJobStatus, TrialJobDetail } from '../../common/trainingService'; -import { JobMetrics, RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData'; +import { JobMetrics } from '../common/jobMetrics'; +import { RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData'; import { SSHClientUtility } from './sshClientUtility'; export class MetricsCollector { diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts index 1e52458790..0cd3a028dc 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts @@ -65,21 +65,6 @@ export class RemoteCommandResult { } } -// 
tslint:disable-next-line:max-classes-per-file -export class JobMetrics { - public readonly jobId: string; - public readonly metrics: string[]; - public readonly jobStatus: TrialJobStatus; - public readonly endTimestamp: number; - - constructor(jobId : string, metrics : string[], jobStatus : TrialJobStatus, endTimestamp : number) { - this.jobId = jobId; - this.metrics = metrics; - this.jobStatus = jobStatus; - this.endTimestamp = endTimestamp; - } -} - /** * RemoteMachineTrialJobDetail */ @@ -121,7 +106,7 @@ export enum ScheduleResultType { REQUIRE_EXCEED_TOTAL } -export const REMOTEMACHINERUNSHELLFORMAT: string = +export const REMOTEMACHINE_RUN_SHELL_FORMAT: string = `#!/bin/bash export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} cd $NNI_SYS_DIR @@ -129,7 +114,7 @@ echo $$ >{2} eval {3}{4} 2>{5} echo $? \`date +%s%3N\` >{6}`; -export const HOSTJOBSHELLFORMAT: string = +export const HOST_JOB_SHELL_FORMAT: string = `#!/bin/bash cd {0} echo $$ >{1} diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 772b93ff5d..e1cff16f22 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -43,8 +43,8 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { GPUScheduler } from './gpuScheduler'; import { MetricsCollector } from './metricsCollector'; import { - HOSTJOBSHELLFORMAT, RemoteCommandResult, RemoteMachineMeta, - REMOTEMACHINERUNSHELLFORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, + HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta, + REMOTEMACHINE_RUN_SHELL_FORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType } from './remoteMachineData'; import { SSHClientUtility } from 
'./sshClientUtility'; @@ -427,7 +427,7 @@ class RemoteMachineTrainingService implements TrainingService { // RemoteMachineRunShellFormat is the run shell format string, // See definition in remoteMachineData.ts const runScriptContent: string = String.Format( - REMOTEMACHINERUNSHELLFORMAT, + REMOTEMACHINE_RUN_SHELL_FORMAT, trialWorkingFolder, trialJobId, path.join(trialWorkingFolder, '.nni', 'jobpid'), @@ -470,7 +470,7 @@ class RemoteMachineTrainingService implements TrainingService { await cpp.exec(`mkdir -p ${localDir}`); await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteDir}`, sshClient); const runScriptContent: string = String.Format( - HOSTJOBSHELLFORMAT, remoteDir, path.join(remoteDir, 'jobpid'), form.cmd, path.join(remoteDir, 'code') + HOST_JOB_SHELL_FORMAT, remoteDir, path.join(remoteDir, 'jobpid'), form.cmd, path.join(remoteDir, 'code') ); await fs.promises.writeFile(path.join(localDir, 'run.sh'), runScriptContent, { encoding: 'utf8' }); await SSHClientUtility.copyFileToRemote( diff --git a/src/nni_manager/training_service/test/hdfsClientUtility.test.ts b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts new file mode 100644 index 0000000000..b8cf30e83a --- /dev/null +++ b/src/nni_manager/training_service/test/hdfsClientUtility.test.ts @@ -0,0 +1,143 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
'use strict';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import * as tmp from 'tmp';
import { cleanupUnitTest, prepareUnitTest, uniqueString } from '../../common/utils';
import { HDFSClientUtility } from '../pai/hdfsClientUtility';

var WebHDFS = require('webhdfs');
var rmdir = require('rmdir');

/**
 * Integration tests for HDFSClientUtility against a real WebHDFS endpoint.
 * Every test silently returns when no HDFS connection info is configured.
 */
describe('WebHDFS', function () {
    /*
    To enable web HDFS client unit test, HDFS information needs to be configured in:
    Default/.vscode/hdfsInfo.json,  whose content looks like:
    {
        "user": "user1",
        "port": 50070,
        "host": "10.0.0.0"
    }
    */
    let skip: boolean = false;
    let testHDFSInfo: any;
    let hdfsClient: any;
    try {
        testHDFSInfo = JSON.parse(fs.readFileSync('../../.vscode/hdfsInfo.json', 'utf8'));
        console.log(testHDFSInfo);
        hdfsClient = WebHDFS.createClient({
            user: testHDFSInfo.user,
            port: testHDFSInfo.port,
            host: testHDFSInfo.host
        });
    } catch (err) {
        // The message must name the file this test actually reads (hdfsInfo.json).
        console.log('Please configure hdfsInfo.json to enable HDFS client unit test.');
        skip = true;
    }

    before(() => {
        chai.should();
        chai.use(chaiAsPromised);
        tmp.setGracefulCleanup();
        prepareUnitTest();
    });

    after(() => {
        cleanupUnitTest();
    });

    it('Test HDFS utility path functions', async () => {
        if (skip) {
            return;
        }
        const testPath : string = '/nni_unittest_' + uniqueString(6);
        let exists : boolean = await HDFSClientUtility.pathExists(testPath, hdfsClient);
        // The new random named path is expected to not exist
        chai.expect(exists).to.be.equals(false);

        const mkdirResult : boolean = await HDFSClientUtility.mkdir(testPath, hdfsClient);
        // Mkdir is expected to be successful
        chai.expect(mkdirResult).to.be.equals(true);

        exists = await HDFSClientUtility.pathExists(testPath, hdfsClient);
        // The newly created path is expected to exist
        chai.expect(exists).to.be.equals(true);

        const deleteResult : boolean = await HDFSClientUtility.deletePath(testPath, hdfsClient);
        // Delete path is expected to be successful
        chai.expect(deleteResult).to.be.equals(true);

        exists = await HDFSClientUtility.pathExists(testPath, hdfsClient);
        // The deleted path is not expected to exist
        chai.expect(exists).to.be.equals(false);
    });

    it('Test HDFS utility copyFileToHdfs', async() => {
        if (skip) {
            return;
        }
        // Prepare local directory and files
        const tmpLocalDirectoryPath : string = path.join(os.tmpdir(), 'nni_unittest_dir_' + uniqueString(6));
        const tmpDataFilePath : string = path.join(tmpLocalDirectoryPath, 'file_' + uniqueString(6));
        const testFileData : string = 'TestContent123';
        fs.mkdirSync(tmpLocalDirectoryPath);
        fs.writeFileSync(tmpDataFilePath, testFileData);

        const testHDFSFilePath : string = '/nni_unittest_' + uniqueString(6);
        let exists : boolean = await HDFSClientUtility.pathExists(testHDFSFilePath, hdfsClient);
        // The new random named path is expected to not exist
        chai.expect(exists).to.be.equals(false);

        await HDFSClientUtility.copyFileToHdfs(tmpDataFilePath, testHDFSFilePath, hdfsClient);
        exists = await HDFSClientUtility.pathExists(testHDFSFilePath, hdfsClient);
        // After copy local file to HDFS, the target file path in HDFS is expected to exist
        chai.expect(exists).to.be.equals(true);

        const buffer : Buffer = await HDFSClientUtility.readFileFromHDFS(testHDFSFilePath, hdfsClient);
        const actualFileData : string = buffer.toString('utf8');
        // The file content read from HDFS is expected to equal to the content of local file
        chai.expect(actualFileData).to.be.equals(testFileData);

        const testHDFSDirPath : string = path.join('/nni_unittest_' + uniqueString(6) +  '_dir');

        await HDFSClientUtility.copyDirectoryToHdfs(tmpLocalDirectoryPath, testHDFSDirPath, hdfsClient);

        const files : any[] = await HDFSClientUtility.readdir(testHDFSDirPath, hdfsClient);

        // Expected file count under HDFS target directory is 1
        chai.expect(files.length).to.be.equals(1);

        // Expected file name under HDFS target directory is equal to local file name
        chai.expect(files[0].pathSuffix).to.be.equals(path.parse(tmpDataFilePath).base);

        // Cleanup
        // NOTE(review): rmdir is called without a callback, so the local cleanup is
        // fire-and-forget and its outcome is not checked by this test.
        rmdir(tmpLocalDirectoryPath);

        let deleteResult : boolean = await HDFSClientUtility.deletePath(testHDFSFilePath, hdfsClient);
        chai.expect(deleteResult).to.be.equals(true);

        deleteResult = await HDFSClientUtility.deletePath(testHDFSDirPath, hdfsClient);
        chai.expect(deleteResult).to.be.equals(true);
    });
});
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
'use strict';

import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as tmp from 'tmp';
import * as component from '../../common/component';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { PAITrainingService } from '../pai/paiTrainingService';

// Stage the mocked trial script in a temporary directory that serves as the trial codeDir.
const localCodeDir: string = tmp.dirSync().name;
const mockedTrialPath: string = './training_service/test/mockedTrial.py';
fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py');

/**
 * Integration test for PAITrainingService.
 * Requires ../../.vscode/paiCluster.json with "userName", "passWord" and "host";
 * every test silently returns when that file is missing or unreadable.
 */
describe('Unit Test for PAITrainingService', () => {
    let skip: boolean = false;
    let testPaiClusterInfo: any;
    let paiCluster: any;
    let paiTrialConfig : any;
    try {
        testPaiClusterInfo = JSON.parse(fs.readFileSync('../../.vscode/paiCluster.json', 'utf8'));
        // JSON.stringify escapes quotes/backslashes in credentials, which hand-built JSON did not.
        paiCluster = JSON.stringify({
            userName: testPaiClusterInfo.userName,
            passWord: testPaiClusterInfo.passWord,
            host: testPaiClusterInfo.host
        });
        // Numeric settings stay strings, exactly as the original hand-written JSON encoded them.
        paiTrialConfig = JSON.stringify({
            command: 'echo hello && ls',
            codeDir: localCodeDir,
            gpuNum: '1',
            cpuNum: '1',
            memoryMB: '8196',
            image: 'openpai/pai.example.tensorflow',
            dataDir: '',
            outputDir: ''
        });
    } catch (err) {
        // The message must name the file this test actually reads (paiCluster.json).
        console.log('Please configure paiCluster.json to enable PAI training service unit test.');
        skip = true;
    }

    let paiTrainingService: PAITrainingService;

    // Log the staged code directory (the original created a second, unused temp dir just to print it).
    console.log(localCodeDir);

    before(() => {
        chai.should();
        chai.use(chaiAsPromised);
        prepareUnitTest();
    });

    after(() => {
        cleanupUnitTest();
    });

    beforeEach(() => {
        if (skip) {
            return;
        }
        paiTrainingService = component.get(PAITrainingService);
        paiTrainingService.run();
    });

    afterEach(() => {
        if (skip) {
            return;
        }
        paiTrainingService.cleanUp();
    });

    it('Get PAI token', async () => {
        if (skip) {
            return;
        }
        console.log(`paiCluster is ${paiCluster}`);
        await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.PAI_CLUSTER_CONFIG, paiCluster);
        await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, paiTrialConfig);
        try {
            const trialDetail = await paiTrainingService.submitTrialJob({jobType : 'TRIAL'});
            chai.expect(trialDetail.status).to.be.equals('WAITING');
        } catch(error) {
            // NOTE(review): a failed submission is only logged; chai.assert(error) passes for
            // any truthy error, so this test does not fail when the cluster rejects the job.
            console.log('Submit job failed:' + error);
            chai.assert(error);
        }
    });
});
import os

# Root path of the PAI-specific REST API served by the NNI manager.
API_ROOT_URL = '/api/v1/nni-pai'

# URL prefix template; the placeholder is filled with the NNI manager host via str.format.
BASE_URL = 'http://{}'

DEFAULT_REST_PORT = 51189

# expanduser('~') honours $HOME when it is set and falls back to the platform's
# user database otherwise, so importing this module no longer raises KeyError
# in environments where HOME is not exported.
HOME_DIR = os.path.join(os.path.expanduser('~'), 'nni')

# Directory and files that receive the trial process's redirected output.
LOG_DIR = os.path.join(HOME_DIR, 'trial-keeper', 'log')

STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')

STDERR_FULL_PATH = os.path.join(LOG_DIR, 'stderr')

# Path segment appended to API_ROOT_URL when reporting trial metrics.
UPDATE_METRICS_API = '/update-metrics'
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import argparse
import errno
import json
import os
import re
import requests

from .constants import BASE_URL, DEFAULT_REST_PORT
from .rest_utils import rest_get, rest_post, rest_put, rest_delete
from .url_utils import gen_update_metrics_url

# Trial environment; a missing variable raises KeyError as soon as this module is imported.
NNI_SYS_DIR = os.environ['NNI_SYS_DIR']
NNI_TRIAL_JOB_ID = os.environ['NNI_TRIAL_JOB_ID']
NNI_EXP_ID = os.environ['NNI_EXP_ID']
# Each record in the metrics file is: MAGIC, a LEN_FIELD_SIZE-character decimal
# length, then that many characters of payload.
LEN_FIELD_SIZE = 6
MAGIC = 'ME'

print('In metrics_reader, NNI_SYS_DIR is {}'.format(NNI_SYS_DIR))

class TrialMetricsReader():
    '''
    Read metrics data from a trial job

    Records are read incrementally from NNI_SYS_DIR/.nni/metrics; the position
    reached so far is persisted in NNI_SYS_DIR/.nni/metrics_offset so that every
    call only returns records appended since the previous call.
    '''
    def __init__(self, rest_port = DEFAULT_REST_PORT):
        self.offset_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics_offset')
        self.metrics_filename = os.path.join(NNI_SYS_DIR, '.nni', 'metrics')
        self.rest_port = rest_port

    def _metrics_file_is_empty(self):
        '''Return True when the metrics file is missing or has zero size'''
        if not os.path.isfile(self.metrics_filename):
            return True
        return os.stat(self.metrics_filename).st_size == 0

    def _get_offset(self):
        '''Return the persisted read offset, or 0 when nothing has been persisted yet'''
        if not os.path.isfile(self.offset_filename):
            return 0
        with open(self.offset_filename, 'r') as f:
            line = f.readline().strip()
        # An existing but empty offset file is treated as "start from the beginning"
        # instead of crashing int() on an empty string.
        return int(line) if line else 0

    def _write_offset(self, offset):
        '''
        Persist the read offset.
        Raises ValueError when offset lies outside the current metrics file.
        '''
        statinfo = os.stat(self.metrics_filename)
        if offset < 0 or offset > statinfo.st_size:
            raise ValueError('offset value is invalid: {}'.format(offset))

        with open(self.offset_filename, 'w') as f:
            f.write(str(offset)+'\n')

    def _format_error(self, offset):
        '''Build the exception raised for a malformed record found after offset'''
        return ValueError("metric file {} format error after offset: {}.".format(self.metrics_filename, offset))

    def _read_all_available_records(self, offset):
        '''
        Read every complete record starting at offset and persist the new offset.

        A record that is cut short by end-of-file is left for the next call: the
        file is polled while the trial is still running, so a short read most
        likely means the record is still being written. Complete records read
        before it are returned instead of being discarded.

        Raises ValueError when a record does not start with MAGIC, or when its
        length field is not a positive integer.
        '''
        # NOTE(review): the file is read in text mode while offsets are compared
        # with st_size (bytes); this presumably relies on ASCII payloads - confirm
        # against the writer.
        new_offset = offset
        metrics = []
        with open(self.metrics_filename, 'r') as f:
            print('offset is {}'.format(offset))
            f.seek(offset)
            while True:
                magic_string = f.read(len(MAGIC))
                # empty data means EOF
                if not magic_string:
                    break
                if magic_string != MAGIC:
                    # A proper prefix of MAGIC at EOF is a record still being written.
                    if MAGIC.startswith(magic_string):
                        break
                    raise self._format_error(new_offset)

                strdatalen = f.read(LEN_FIELD_SIZE)
                # a short length field means the record is not completely written yet
                if len(strdatalen) < LEN_FIELD_SIZE:
                    break
                try:
                    datalen = int(strdatalen)
                except ValueError:
                    raise self._format_error(new_offset)
                if datalen <= 0:
                    raise self._format_error(new_offset)

                data = f.read(datalen)
                # a short payload means the record is not completely written yet
                if len(data) < datalen:
                    break

                print('data is \'{}\''.format(data))
                # Only advance past records that were read completely.
                new_offset = f.tell()
                metrics.append(data)
        self._write_offset(new_offset)
        return metrics

    def read_trial_metrics(self):
        '''
        Read available metrics data for a trial
        Returns a list of record payloads; empty when there is nothing to read.
        '''
        if self._metrics_file_is_empty():
            print('metrics is empty')
            return []

        offset = self._get_offset()
        return self._read_all_available_records(offset)

def read_experiment_metrics(nnimanager_ip):
    '''
    Read the new metrics of the current trial job and post them to NNI manager

    nnimanager_ip: host of the NNI manager REST server.
    Returns the JSON text of the collected result. The call is best effort:
    failures are reported on stdout and never raised to the caller.
    '''
    # NOTE(review): the read offset is persisted before the POST, so metrics whose
    # upload fails are not sent again on the next call - confirm this is acceptable.
    result = {}
    try:
        reader = TrialMetricsReader()
        result['jobId'] = NNI_TRIAL_JOB_ID
        result['metrics'] = reader.read_trial_metrics()
        print('Result metrics is {}'.format(json.dumps(result)))
        if len(result['metrics']) > 0:
            url = gen_update_metrics_url(BASE_URL.format(nnimanager_ip), reader.rest_port, NNI_EXP_ID, NNI_TRIAL_JOB_ID)
            response = rest_post(url, json.dumps(result), 10)
            # rest_post returns None when the request itself failed.
            if response is None:
                print('Failed to post metrics to {}'.format(url))
            else:
                print('Response code is {}'.format(response.status_code))
    except Exception as err:
        #TODO error logging to file
        # Keep the trial keeper alive, but do not hide the failure completely.
        print('Failed to read or report metrics: {}'.format(err))

    return json.dumps(result)
+# +# MIT License +# +# Permission is hereby granted, free of charge, +# to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
import time
import requests

# Headers sent with every request that carries a JSON body.
_JSON_HEADERS = {'Accept': 'application/json', 'Content-Type': 'application/json'}

def _try_request(send, url, **kwargs):
    '''Call send(url, **kwargs); return its response, or None if it raises'''
    try:
        return send(url, **kwargs)
    except Exception:
        # Best effort by design: callers treat None as "request failed".
        return None

def rest_get(url, timeout):
    '''Call rest get method

    Returns the response object, or None when the request could not be completed.
    '''
    return _try_request(requests.get, url, timeout=timeout)

def rest_post(url, data, timeout):
    '''Call rest post method

    data is sent as the request body with JSON content headers.
    Returns the response object, or None when the request could not be completed.
    '''
    return _try_request(requests.post, url, headers=dict(_JSON_HEADERS), data=data, timeout=timeout)

def rest_put(url, data, timeout):
    '''Call rest put method

    data is sent as the request body with JSON content headers.
    Returns the response object, or None when the request could not be completed.
    '''
    return _try_request(requests.put, url, headers=dict(_JSON_HEADERS), data=data, timeout=timeout)

def rest_delete(url, timeout):
    '''Call rest delete method

    Returns the response object, or None when the request could not be completed.
    '''
    return _try_request(requests.delete, url, timeout=timeout)
import argparse
import sys
import os
from subprocess import Popen, PIPE
import time
import logging
import shlex

from .constants import HOME_DIR, LOG_DIR, STDOUT_FULL_PATH, STDERR_FULL_PATH
from .metrics_reader import read_experiment_metrics

logger = logging.getLogger('trial_keeper')

def main_loop(args):
    '''main loop logic for trial keeper

    Launches args.trial_command through the shell with its stdout/stderr appended
    to the trial-keeper log files, then reports trial metrics to the NNI manager
    at args.nnimanager_ip every 2 seconds until the trial process exits.
    '''
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(LOG_DIR, exist_ok=True)

    # The context manager closes both log files once the trial has finished,
    # including when the loop is interrupted by an exception.
    with open(STDOUT_FULL_PATH, 'a+') as stdout_file, open(STDERR_FULL_PATH, 'a+') as stderr_file:
        print(shlex.split(args.trial_command))
        # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior
        # NOTE(review): the command string is executed through the shell as given.
        process = Popen(args.trial_command, shell = True, stdout = stdout_file, stderr = stderr_file)
        print('Subprocess pid is {}'.format(process.pid))
        print('Current cwd is {}'.format(os.getcwd()))
        while True:
            retCode = process.poll()
            ## Read experiment metrics, to avoid missing metrics
            # Polling the process first guarantees one more metrics read after it exited.
            read_experiment_metrics(args.nnimanager_ip)

            if retCode is not None:
                print('subprocess terminated. Exit code is {}. Quit'.format(retCode))
                break

            print('subprocess pid: {} is still alive'.format(process.pid))
            time.sleep(2)

def trial_keeper_help_info(*args):
    '''Point the user at --help; accepts and ignores any arguments'''
    print('please run --help to see guidance')

if __name__ == '__main__':
    # NNI Trial Keeper main function
    PARSER = argparse.ArgumentParser()
    PARSER.set_defaults(func=trial_keeper_help_info)
    PARSER.add_argument('--trial_command', type=str, help='Command to launch trial process')
    PARSER.add_argument('--nnimanager_ip', type=str, default='localhost', help='NNI manager IP')
    args, unknown = PARSER.parse_known_args()
    if args.trial_command is None:
        # Tell the user why nothing was launched instead of exiting silently.
        args.func(args)
        sys.exit(1)

    try:
        main_loop(args)
    except KeyboardInterrupt:
        print('Exiting by user request')
        sys.exit(1)
    except Exception as err:
        # Any other failure is an error, not a user request; report it as such.
        print('Trial keeper failed: {}'.format(err))
        sys.exit(1)
from .constants import API_ROOT_URL, UPDATE_METRICS_API

def gen_update_metrics_url(base_url, port, exp_id, trial_job_id):
    '''Generate update trial metrics url

    Result layout: <base_url>:<port><API_ROOT_URL><UPDATE_METRICS_API>/<exp_id>/:<trial_job_id>
    '''
    # NOTE(review): the ':' in front of the trial job id is emitted literally; it
    # looks like a route-parameter marker - confirm the server expects it.
    url_template = '{0}:{1}{2}{3}/{4}/:{5}'
    url_parts = (base_url, port, API_ROOT_URL, UPDATE_METRICS_API, exp_id, trial_job_id)
    return url_template.format(*url_parts)
-455,6 +463,12 @@ combined-stream@1.0.6, combined-stream@~1.0.5: dependencies: delayed-stream "~1.0.0" +combined-stream@~1.0.6: + version "1.0.7" + resolved "https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.7.tgz#2d1d24317afb8abe95d6d2c0b07b57813539d828" + dependencies: + delayed-stream "~1.0.0" + commander@2.15.1: version "2.15.1" resolved "https://registry.yarnpkg.com/commander/-/commander-2.15.1.tgz#df46e867d0fc2aec66a34662b406a9ccafff5b0f" @@ -635,7 +649,7 @@ extend@2.0.x: version "2.0.2" resolved "https://registry.yarnpkg.com/extend/-/extend-2.0.2.tgz#1b74985400171b85554894459c978de6ef453ab7" -extend@~3.0.1: +extend@^3.0.0, extend@~3.0.1, extend@~3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/extend/-/extend-3.0.2.tgz#f8b1136b4071fbd8eb140aff858b1019ec2915fa" @@ -671,7 +685,7 @@ forever-agent@~0.6.1: version "0.6.1" resolved "https://registry.yarnpkg.com/forever-agent/-/forever-agent-0.6.1.tgz#fbc71f0c41adeb37f96c577ad1ed42d8fdacca91" -form-data@~2.3.1: +form-data@~2.3.1, form-data@~2.3.2: version "2.3.2" resolved "https://registry.yarnpkg.com/form-data/-/form-data-2.3.2.tgz#4970498be604c20c005d4f5c23aecd21d6b49099" dependencies: @@ -763,6 +777,13 @@ har-validator@~5.0.3: ajv "^5.1.0" har-schema "^2.0.0" +har-validator@~5.1.0: + version "5.1.0" + resolved "https://registry.yarnpkg.com/har-validator/-/har-validator-5.1.0.tgz#44657f5688a22cfd4b72486e81b3a3fb11742c29" + dependencies: + ajv "^5.3.0" + har-schema "^2.0.0" + has-ansi@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/has-ansi/-/has-ansi-2.0.0.tgz#34f5049ce1ecdf2b0649af3ef24e45ed35416d91" @@ -870,6 +891,10 @@ is-typedarray@~1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/is-typedarray/-/is-typedarray-1.0.0.tgz#e479c80858df0c1b11ddda6940f96011fcda4a9a" +is@~0.2.6: + version "0.2.7" + resolved "http://registry.npmjs.org/is/-/is-0.2.7.tgz#3b34a2c48f359972f35042849193ae7264b63562" + isarray@~1.0.0: version "1.0.0" resolved 
"https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" @@ -958,12 +983,22 @@ mime-db@~1.35.0: version "1.35.0" resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.35.0.tgz#0569d657466491283709663ad379a99b90d9ab47" +mime-db@~1.36.0: + version "1.36.0" + resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.36.0.tgz#5020478db3c7fe93aad7bbcc4dcf869c43363397" + mime-types@^2.1.12, mime-types@~2.1.17, mime-types@~2.1.18: version "2.1.19" resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.19.tgz#71e464537a7ef81c15f2db9d97e913fc0ff606f0" dependencies: mime-db "~1.35.0" +mime-types@~2.1.19: + version "2.1.20" + resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.20.tgz#930cb719d571e903738520f8470911548ca2cc19" + dependencies: + mime-db "~1.36.0" + mime@1.4.1: version "1.4.1" resolved "https://registry.yarnpkg.com/mime/-/mime-1.4.1.tgz#121f9ebc49e3766f311a76e1fa1c8003c4b03aa6" @@ -1066,6 +1101,19 @@ node-version@^1.0.0: version "1.2.0" resolved "https://registry.yarnpkg.com/node-version/-/node-version-1.2.0.tgz#34fde3ffa8e1149bd323983479dda620e1b5060d" +node.extend@1.0.8: + version "1.0.8" + resolved "https://registry.yarnpkg.com/node.extend/-/node.extend-1.0.8.tgz#bab04379f7383f4587990c9df07b6a7f65db772b" + dependencies: + is "~0.2.6" + object-keys "~0.4.0" + +node.flow@1.2.3: + version "1.2.3" + resolved "https://registry.yarnpkg.com/node.flow/-/node.flow-1.2.3.tgz#e1c44a82aeca8d78b458a77fb3dc642f2eba2649" + dependencies: + node.extend "1.0.8" + nopt@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/nopt/-/nopt-4.0.1.tgz#d0d4685afd5415193c8c7505602d0d17cd64474d" @@ -1101,10 +1149,18 @@ oauth-sign@~0.8.2: version "0.8.2" resolved "https://registry.yarnpkg.com/oauth-sign/-/oauth-sign-0.8.2.tgz#46a6ab7f0aead8deae9ec0565780b7d4efeb9d43" +oauth-sign@~0.9.0: + version "0.9.0" + resolved 
"https://registry.yarnpkg.com/oauth-sign/-/oauth-sign-0.9.0.tgz#47a7b016baa68b5fa0ecf3dee08a85c679ac6455" + object-assign@^4.0.1, object-assign@^4.1.0: version "4.1.1" resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863" +object-keys@~0.4.0: + version "0.4.0" + resolved "https://registry.yarnpkg.com/object-keys/-/object-keys-0.4.0.tgz#28a6aae7428dd2c3a92f3d95f21335dd204e0336" + on-finished@~2.3.0: version "2.3.0" resolved "https://registry.yarnpkg.com/on-finished/-/on-finished-2.3.0.tgz#20f1336481b083cd75337992a16971aa2d906947" @@ -1199,6 +1255,10 @@ pseudomap@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/pseudomap/-/pseudomap-1.0.2.tgz#f052a28da70e618917ef0a8ac34c1ae5a68286b3" +psl@^1.1.24: + version "1.1.29" + resolved "https://registry.yarnpkg.com/psl/-/psl-1.1.29.tgz#60f580d360170bb722a797cc704411e6da850c67" + punycode@^1.4.1: version "1.4.1" resolved "https://registry.yarnpkg.com/punycode/-/punycode-1.4.1.tgz#c0d5a63b2718800ad8e1eb0fa5269c84dd41845e" @@ -1207,7 +1267,7 @@ qs@6.5.1: version "6.5.1" resolved "https://registry.yarnpkg.com/qs/-/qs-6.5.1.tgz#349cdf6eef89ec45c12d7d5eb3fc0c870343a6d8" -qs@~6.5.1: +qs@~6.5.1, qs@~6.5.2: version "6.5.2" resolved "https://registry.yarnpkg.com/qs/-/qs-6.5.2.tgz#cb3ae806e8740444584ef154ce8ee98d403f3e36" @@ -1249,6 +1309,31 @@ reflect-metadata@^0.1.10: version "0.1.12" resolved "https://registry.yarnpkg.com/reflect-metadata/-/reflect-metadata-0.1.12.tgz#311bf0c6b63cd782f228a81abe146a2bfa9c56f2" +request@^2.74.0: + version "2.88.0" + resolved "https://registry.yarnpkg.com/request/-/request-2.88.0.tgz#9c2fca4f7d35b592efe57c7f0a55e81052124fef" + dependencies: + aws-sign2 "~0.7.0" + aws4 "^1.8.0" + caseless "~0.12.0" + combined-stream "~1.0.6" + extend "~3.0.2" + forever-agent "~0.6.1" + form-data "~2.3.2" + har-validator "~5.1.0" + http-signature "~1.2.0" + is-typedarray "~1.0.0" + isstream "~0.1.2" + json-stringify-safe "~5.0.1" + mime-types 
"~2.1.19" + oauth-sign "~0.9.0" + performance-now "^2.1.0" + qs "~6.5.2" + safe-buffer "^5.1.2" + tough-cookie "~2.4.3" + tunnel-agent "^0.6.0" + uuid "^3.3.2" + request@^2.87.0: version "2.87.0" resolved "https://registry.yarnpkg.com/request/-/request-2.87.0.tgz#32f00235cd08d482b4d0d68db93a829c0ed5756e" @@ -1294,6 +1379,12 @@ rimraf@^2.6.1: dependencies: glob "^7.0.5" +rmdir@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/rmdir/-/rmdir-1.2.0.tgz#4fe0357cb06168c258e73e968093dc4e8a0f3253" + dependencies: + node.flow "1.2.3" + rx@^4.1.0: version "4.1.0" resolved "https://registry.yarnpkg.com/rx/-/rx-4.1.0.tgz#a5f13ff79ef3b740fe30aa803fb09f98805d4782" @@ -1510,6 +1601,13 @@ tough-cookie@~2.3.3: dependencies: punycode "^1.4.1" +tough-cookie@~2.4.3: + version "2.4.3" + resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-2.4.3.tgz#53f36da3f47783b0925afa06ff9f3b165280f781" + dependencies: + psl "^1.1.24" + punycode "^1.4.1" + tree-kill@^1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/tree-kill/-/tree-kill-1.2.0.tgz#5846786237b4239014f05db156b643212d4c6f36" @@ -1612,7 +1710,7 @@ utils-merge@1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/utils-merge/-/utils-merge-1.0.1.tgz#9f95710f50a267947b2ccc124741c1028427e713" -uuid@^3.1.0: +uuid@^3.1.0, uuid@^3.3.2: version "3.3.2" resolved "https://registry.yarnpkg.com/uuid/-/uuid-3.3.2.tgz#1b4af4955eb3077c501c23872fc6513811587131" @@ -1628,6 +1726,14 @@ verror@1.10.0: core-util-is "1.0.2" extsprintf "^1.2.0" +webhdfs@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/webhdfs/-/webhdfs-1.2.0.tgz#c41b08ae33944a0220863bfd4b6719b9aaec1d37" + dependencies: + buffer-stream-reader "^0.1.1" + extend "^3.0.0" + request "^2.74.0" + which@^1.2.9: version "1.3.1" resolved "https://registry.yarnpkg.com/which/-/which-1.3.1.tgz#a45043d54f5805316da8d62f9f50918d3da70b0a" diff --git a/src/sdk/pynni/nni/platform/__init__.py b/src/sdk/pynni/nni/platform/__init__.py index 
e0b44e49cb..fed452fc47 100644 --- a/src/sdk/pynni/nni/platform/__init__.py +++ b/src/sdk/pynni/nni/platform/__init__.py @@ -27,7 +27,7 @@ from .standalone import * elif env_args.platform == 'unittest': from .test import * -elif env_args.platform in ('local', 'remote'): +elif env_args.platform in ('local', 'remote', 'pai'): from .local import * else: raise RuntimeError('Unknown platform %s' % env_args.platform) diff --git a/tools/nnicmd/config_schema.py b/tools/nnicmd/config_schema.py index 8cd8431151..86164f6a88 100644 --- a/tools/nnicmd/config_schema.py +++ b/tools/nnicmd/config_schema.py @@ -50,7 +50,12 @@ 'trial':{ 'command': str, 'codeDir': os.path.exists, - 'gpuNum': And(int, lambda x: 0 <= x <= 99999) + 'gpuNum': And(int, lambda x: 0 <= x <= 99999), + Optional('cpuNum'): And(int, lambda x: 0 <= x <= 99999), + Optional('memoryMB'): int, + Optional('image'): str, + Optional('dataDir'): str, + Optional('outputDir'): str }, Optional('assessor'): Or({ 'builtinAssessorName': lambda x: x in ['Medianstop'], @@ -77,36 +82,9 @@ 'sshKeyPath': os.path.exists, Optional('passphrase'): str })], -Optional('pai'): -{ - 'jobName': str, - "image": str, - "authFile": os.path.exists, - "dataDir": os.path.exists, - "outputDir": os.path.exists, - "codeDir": os.path.exists, - "virtualCluster": str, - "taskRoles": [ - { - "name": str, - "taskNumber": And(int, lambda x: 0 <= x <= 99999), - "cpuNumber": And(int, lambda x: 0 <= x <= 99999), - "memoryMB": And(int, lambda x: 0 <= x <= 99999), - "shmMB": And(int, lambda x: 0 <= x <= 99999), - "gpuNumber": And(int, lambda x: 0 <= x <= 99999), - "portList": [ - { - "label": str, - "beginAt": str, - "portNumber": And(int, lambda x: 0 < x < 65535) - } - ], - "command": str, - "minFailedTaskCount": And(int, lambda x: 0 <= x <= 99999), - "minSucceededTaskCount": And(int, lambda x: 0 <= x <= 99999) - } - ], - "gpuType": str, - "retryCount": And(int, lambda x: 0 <= x <= 99999) +Optional('paiConfig'):{ + 'userName': str, + 'passWord': str, + 
'host': str } }) \ No newline at end of file diff --git a/tools/nnicmd/launcher.py b/tools/nnicmd/launcher.py index b99a428b1d..25539693ad 100644 --- a/tools/nnicmd/launcher.py +++ b/tools/nnicmd/launcher.py @@ -64,6 +64,16 @@ def set_trial_config(experiment_config, port): value_dict['command'] = experiment_config['trial']['command'] value_dict['codeDir'] = experiment_config['trial']['codeDir'] value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] + if experiment_config['trial'].get('cpuNum'): + value_dict['cpuNum'] = experiment_config['trial']['cpuNum'] + if experiment_config['trial'].get('memoryMB'): + value_dict['memoryMB'] = experiment_config['trial']['memoryMB'] + if experiment_config['trial'].get('image'): + value_dict['image'] = experiment_config['trial']['image'] + if experiment_config['trial'].get('dataDir'): + value_dict['dataDir'] = experiment_config['trial']['dataDir'] + if experiment_config['trial'].get('outputDir'): + value_dict['outputDir'] = experiment_config['trial']['outputDir'] request_data['trial_config'] = value_dict response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20) return True if response.status_code == 200 else False @@ -87,6 +97,20 @@ def set_remote_config(experiment_config, port): #set trial_config return set_trial_config(experiment_config, port), err_message +def set_pai_config(experiment_config, port): + '''set pai configuration''' + pai_config_data = dict() + pai_config_data['pai_config'] = experiment_config['paiConfig'] + response = rest_put(cluster_metadata_url(port), json.dumps(pai_config_data), 20) + err_message = '' + if not response or not response.status_code == 200: + if response is not None: + err_message = response.text + return False, err_message + + #set trial_config + return set_trial_config(experiment_config, port), err_message + def set_experiment(experiment_config, mode, port): '''Call startExperiment (rest POST /experiment) with yaml file content''' request_data = dict() @@ -106,7 +130,7 
@@ def set_experiment(experiment_config, mode, port): {'key':'codeDir', 'value':experiment_config['trial']['codeDir']}) request_data['clusterMetaData'].append( {'key': 'command', 'value': experiment_config['trial']['command']}) - else: + elif experiment_config['trainingServicePlatform'] == 'remote': request_data['clusterMetaData'].append( {'key': 'machine_list', 'value': experiment_config['machineList']}) value_dict = dict() @@ -115,6 +139,20 @@ def set_experiment(experiment_config, mode, port): value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] request_data['clusterMetaData'].append( {'key': 'trial_config', 'value': value_dict}) + elif experiment_config['trainingServicePlatform'] == 'pai': + request_data['clusterMetaData'].append( + {'key': 'pai_config', 'value': experiment_config['paiConfig']}) + value_dict = dict() + value_dict['command'] = experiment_config['trial']['command'] + value_dict['codeDir'] = experiment_config['trial']['codeDir'] + value_dict['gpuNum'] = experiment_config['trial']['gpuNum'] + value_dict['cpuNum'] = experiment_config['trial']['cpuNum'] + value_dict['memoryMB'] = experiment_config['trial']['memoryMB'] + value_dict['image'] = experiment_config['trial']['image'] + value_dict['dataDir'] = experiment_config['trial']['dataDir'] + value_dict['outputDir'] = experiment_config['trial']['outputDir'] + request_data['clusterMetaData'].append( + {'key': 'trial_config', 'value': value_dict}) response = rest_post(experiment_url(port), json.dumps(request_data), 20) return response if response.status_code == 200 else None @@ -183,6 +221,21 @@ def launch_experiment(args, experiment_config, mode, webuiport, experiment_id=No except Exception: raise Exception(ERROR_INFO % 'Rest server stopped!') exit(0) + + #set pai config + if experiment_config['trainingServicePlatform'] == 'pai': + print_normal('Setting pai config...') + config_result, err_msg = set_pai_config(experiment_config, REST_PORT) + if config_result: + print_normal('Success!') + else: + 
print_error('Failed! Error is: {}'.format(err_msg)) + try: + cmds = ['pkill', '-P', str(rest_process.pid)] + call(cmds) + except Exception: + raise Exception(ERROR_INFO % 'Rest server stopped!') + exit(0) # start a new experiment print_normal('Starting experiment...') From aa4f306d6474fbd09ea6e10fa9a3f58adf731fca Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 18:47:26 +0800 Subject: [PATCH 38/60] fix conflict --- .../training_service/pai/paiTrainingService.ts | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 18b31d481b..f05d32021f 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -38,11 +38,7 @@ import { import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { ObservableTimer } from '../../common/observableTimer'; import { PAIJobRestServer } from './paiJobRestServer' -<<<<<<< HEAD import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT } from './paiData'; -======= -import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiData'; ->>>>>>> fad2ba359d74a8fcac6b8d1e34571b7206a3b50d import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { String } from 'typescript-string-operations'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; @@ -156,7 +152,6 @@ class PAITrainingService implements TrainingService { // Step 1. 
Prepare PAI job configuration const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); -<<<<<<< HEAD //get hdfs url const hdfsURLPattern: string = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' const hdfsHostURL = this.paiTrialConfig.outputDir.match(hdfsURLPattern) @@ -174,9 +169,6 @@ class PAITrainingService implements TrainingService { //get hdfsOUtputDir const hdfsBaseDirectory = this.paiTrialConfig.outputDir.replace(hdfsHostURL[0], "") const hdfsOutputDir = path.join(hdfsBaseDirectory, this.experimentId, trialJobId) -======= ->>>>>>> fad2ba359d74a8fcac6b8d1e34571b7206a3b50d - const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', @@ -192,16 +184,11 @@ class PAITrainingService implements TrainingService { `/root/${trialJobId}`, trialJobId, this.experimentId, -<<<<<<< HEAD hdfsHost[0], this.paiClusterConfig.userName, this.paiTrialConfig.command, getIPV4Address(), hdfsOutputDir -======= - this.paiTrialConfig.command, - getIPV4Address() ->>>>>>> fad2ba359d74a8fcac6b8d1e34571b7206a3b50d ).replace(/\r\n|\n|\r/gm, ''); console.log(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); From 45c96004fc9a3e5437fc94849db8a3a078fb0109 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Tue, 25 Sep 2018 18:48:56 +0800 Subject: [PATCH 39/60] fix conflict --- src/nni_manager/training_service_tool/trial/trial_keeper.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 279c3018c0..a536097cc9 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -24,14 +24,10 @@ import time import logging import shlex -<<<<<<< HEAD import re from pyhdfs import HdfsClient from .hdfsClientUtility import copyDirectoryToHdfs -======= 
- ->>>>>>> fad2ba359d74a8fcac6b8d1e34571b7206a3b50d from .constants import HOME_DIR, LOG_DIR, STDOUT_FULL_PATH, STDERR_FULL_PATH from .metrics_reader import read_experiment_metrics From 24dd1b634be19220a505f08161c6ef1081e9bff5 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Tue, 25 Sep 2018 18:49:17 +0800 Subject: [PATCH 40/60] Remove unused import and paiTrialConfig file --- .../training_service/common/jobMetrics.ts | 2 +- .../pai/paiJobInfoCollector.ts | 1 - .../pai/paiTrainingService.ts | 17 +++----- .../training_service/pai/paiTrialConfig.ts | 39 ------------------- 4 files changed, 6 insertions(+), 53 deletions(-) delete mode 100644 src/nni_manager/training_service/pai/paiTrialConfig.ts diff --git a/src/nni_manager/training_service/common/jobMetrics.ts b/src/nni_manager/training_service/common/jobMetrics.ts index 90228ffa7d..a1abe64574 100644 --- a/src/nni_manager/training_service/common/jobMetrics.ts +++ b/src/nni_manager/training_service/common/jobMetrics.ts @@ -34,4 +34,4 @@ export class JobMetrics { this.jobStatus = jobStatus; this.endTimestamp = endTimestamp; } -} \ No newline at end of file +} diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index f347205b80..61fb3ec321 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -87,7 +87,6 @@ export class PAIJobInfoCollector { } } else { if(response.body.jobStatus && response.body.jobStatus.state) { - console.log(`*****IN getSinglePAITrialJobInfo: response body state is ${response.body.jobStatus.state}`); switch(response.body.jobStatus.state) { case 'WAITING': paiTrialJob.status = 'WAITING'; diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index efaa9a7c14..c792e08c1b 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ 
b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -28,23 +28,21 @@ import * as request from 'request'; import { Deferred } from 'ts-deferred'; import { EventEmitter } from 'events'; -import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common/errors'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { HDFSClientUtility } from './hdfsClientUtility' +import { MethodNotImplementedError } from '../../common/errors'; import { getLogger, Logger } from '../../common/log'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { - HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm, - TrialJobDetail, TrialJobMetric, TrialJobStatus + JobApplicationForm, TrainingService, TrialJobApplicationForm, + TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; -import { ObservableTimer } from '../../common/observableTimer'; import { PAIJobRestServer } from './paiJobRestServer' import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT } from './paiData'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { String } from 'typescript-string-operations'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; -import { HDFSClientUtility } from './hdfsClientUtility' -import { getExperimentId } from '../../common/experimentStartupInfo'; - var WebHDFS = require('webhdfs'); @@ -199,8 +197,6 @@ class PAITrainingService implements TrainingService { // TODO: Add Virutal Cluster // PAI Task roles paiTaskRoles); - console.log(`PAI job config is ${JSON.stringify(paiJobConfig)}`); - console.log(`Before submission, trial job detail is ${JSON.stringify(trialJobDetail)}`); // Step 2. 
Upload code files in codeDir onto HDFS try { @@ -223,7 +219,6 @@ class PAITrainingService implements TrainingService { } }; request(submitJobRequest, (error: Error, response: request.Response, body: any) => { - console.log(`After submission, trial job detail is ${JSON.stringify(trialJobDetail)}`); if (error || response.statusCode >= 400) { this.log.error(`PAI Training service: Submit trial ${trialJobId} to PAI Cluster failed!`); trialJobDetail.status = 'FAILED'; @@ -290,7 +285,6 @@ class PAITrainingService implements TrainingService { } this.paiToken = body.token; - console.log(`Got token ${this.paiToken} from PAI Cluster`); deferred.resolve(); } }); @@ -302,7 +296,6 @@ class PAITrainingService implements TrainingService { break; } this.paiTrialConfig = JSON.parse(value); - console.log(`Set Cluster metadata: paiTrialConfig is ${JSON.stringify(this.paiTrialConfig)}`); deferred.resolve(); break; default: diff --git a/src/nni_manager/training_service/pai/paiTrialConfig.ts b/src/nni_manager/training_service/pai/paiTrialConfig.ts deleted file mode 100644 index 583db9e725..0000000000 --- a/src/nni_manager/training_service/pai/paiTrialConfig.ts +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -'use strict'; - -import {TrialConfig} from '../common/trialConfig' - -export class PAITrialConfig extends TrialConfig{ - public readonly cpuNum: number; - public readonly memoryMB: number; - public readonly image: string; - public readonly dataDir: string; - public readonly outputDir: string; - - constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) { - super(command, codeDir, gpuNum); - this.cpuNum = cpuNum; - this.memoryMB = memoryMB; - this.image = image; - this.dataDir = dataDir; - this.outputDir = outputDir; - } -} \ No newline at end of file From 3e0cce21222f3af9d90564f21ab376f7dbf1df34 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 26 Sep 2018 15:50:27 +0800 Subject: [PATCH 41/60] refactor code --- .../training_service/pai/paiData.ts | 5 ++- .../pai/paiTrainingService.ts | 37 +++++++++---------- .../trial/trial_keeper.py | 8 ++-- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index fb290e632e..7f8f8bb7c5 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -47,9 +47,10 @@ export class PAITrialJobDetail implements TrialJobDetail { export const PAI_TRIAL_COMMAND_FORMAT: string = `pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai-t-shya2 -&& export NNI_PLATFORM=pai 
NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} NNI_HDFS_HOST={3} NNI_USER_NAME={4} +&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && mkdir .nni -&& python3 -m trial.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --pai_hdfs_output_dir '{7}'`; +&& python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}' +--pai_hdfs_host '{6}' --pai_user_name {7}`; export const PAI_OUTPUT_DIR_FORMAT: string = `hdfs://{0}:9000/{1}`; diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index ea2f5c59e4..62a8ced34f 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -135,7 +135,7 @@ class PAITrainingService implements TrainingService { const trialJobId: string = uniqueString(5); //TODO: use HDFS working folder instead const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); - + const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); //create tmp trial working folder locally. await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); @@ -146,26 +146,25 @@ class PAITrainingService implements TrainingService { if(trialForm) { await fs.promises.writeFile(path.join(trialLocalTempFolder, 'parameter.cfg'), trialForm.hyperParameters, { encoding: 'utf8' }); } - + // Step 1. 
Prepare PAI job configuration const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); - //get hdfs url - const hdfsURLPattern: string = 'hdfs://[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}:[0-9]{2,5}' - const hdfsHostURL = this.paiTrialConfig.outputDir.match(hdfsURLPattern) - if(hdfsHostURL === null){ - throw new Error('HDFS ouotput dir format error!'); - } - //get hdfs host - const hdfsHostPattern:string = '[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}' - const hdfsHost = this.paiTrialConfig.outputDir.match(hdfsHostPattern) - if(hdfsHost === null){ - throw new Error('HDFS ouotput dir format error!'); + const hdfsDirPattern: string = 'hdfs://(?([0-9]{1,3}.){3}[0-9]{1,3}):[0-9]{2,5}(?/.*)' + + const hdfsDirContent = this.paiTrialConfig.outputDir.match(hdfsDirPattern) + + if(hdfsDirContent === null){ + throw new Error('Trial outputDir format Error'); } - - //get hdfsOUtputDir - const hdfsBaseDirectory = this.paiTrialConfig.outputDir.replace(hdfsHostURL[0], "") + const groups = hdfsDirContent.groups + if(groups === undefined){ + throw new Error('Trial outputDir format Error'); + } + + const hdfsHost = groups['host'] + const hdfsBaseDirectory = groups['baseDir'] const hdfsOutputDir = path.join(hdfsBaseDirectory, this.experimentId, trialJobId) const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, @@ -182,11 +181,11 @@ class PAITrainingService implements TrainingService { `/root/${trialJobId}`, trialJobId, this.experimentId, - hdfsHost[0], - this.paiClusterConfig.userName, this.paiTrialConfig.command, getIPV4Address(), - hdfsOutputDir + hdfsOutputDir, + hdfsHost, + this.paiClusterConfig.userName ).replace(/\r\n|\n|\r/gm, ''); console.log(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`); diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index a536097cc9..834db206ae 100644 
--- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -55,11 +55,9 @@ def main_loop(args): print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) #copy local directory to hdfs local_directory = os.environ['NNI_OUTPUT_DIR'] - hdfs_host = os.environ['NNI_HDFS_HOST'] - nni_user_name = os.environ['NNI_USER_NAME'] trial_job_id = os.environ['NNI_TRIAL_JOB_ID'] exp_id = os.environ['NNI_EXP_ID'] - hdfs_client = HdfsClient(hosts='{0}:{1}'.format(hdfs_host, '50070'), user_name=nni_user_name) + hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name) print(local_directory, args.pai_hdfs_output_dir) if copyDirectoryToHdfs(local_directory, args.pai_hdfs_output_dir, hdfs_client): print('copy directory success!') @@ -80,7 +78,9 @@ def trial_keeper_help_info(*args): PARSER.set_defaults(func=trial_keeper_help_info) PARSER.add_argument('--trial_command', type=str, help='Command to launch trial process') PARSER.add_argument('--nnimanager_ip', type=str, default='localhost', help='NNI manager IP') - PARSER.add_argument('--pai_hdfs_output_dir', type=str, help='pai_hdfs_output_dir') + PARSER.add_argument('--pai_hdfs_output_dir', type=str, help='the output dir of hdfs') + PARSER.add_argument('--pai_hdfs_host', type=str, help='the host of hdfs') + PARSER.add_argument('--pai_user_name', type=str, help='the username of hdfs') args, unknown = PARSER.parse_known_args() if args.trial_command is None: exit(1) From ef1eaf85fa666e39482c01cd467c7f35f91fc7b5 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 26 Sep 2018 16:21:31 +0800 Subject: [PATCH 42/60] fix comments --- .../training_service/pai/paiTrainingService.ts | 4 +--- .../trial/hdfsClientUtility.py | 4 +++- .../training_service_tool/trial/trial_keeper.py | 15 +++++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git 
a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 62a8ced34f..3cc38c9d45 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -314,6 +314,7 @@ class PAITrainingService implements TrainingService { break; } this.paiTrialConfig = JSON.parse(value); + //paiTrialConfig.outputDir could be null if it is not set in nnictl if(this.paiTrialConfig.outputDir === undefined || this.paiTrialConfig.outputDir === null){ this.paiTrialConfig.outputDir = String.Format( PAI_OUTPUT_DIR_FORMAT, @@ -321,9 +322,6 @@ class PAITrainingService implements TrainingService { this.paiClusterConfig.userName ).replace(/\r\n|\n|\r/gm, ''); } - - console.log(`Set Cluster metadata: paiTrialConfig is ${JSON.stringify(this.paiTrialConfig)}`); - deferred.resolve(); break; default: diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py index 1bbfa258ca..0b6daeb2c4 100644 --- a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py +++ b/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py @@ -25,7 +25,7 @@ def copyDirectoryToHdfs(localDirectory, hdfsDirectory, hdfsClient): '''Copy directory from local to hdfs''' if not os.path.exists(localDirectory): raise Exception('Local Directory does not exist!') - hdfsClient.mkdirs(hdfsDirectory) + hdfsClient.mkdirs(hdfsDirectory) result = True for file in os.listdir(localDirectory): file_path = os.path.join(localDirectory, file) @@ -49,6 +49,8 @@ def copyFileToHdfs(localFilePath, hdfsFilePath, hdfsClient, override=True): '''Copy a local file to hdfs directory''' if not os.path.exists(localFilePath): raise Exception('Local file Path does not exist!') + if os.path.isdir(localFilePath): + raise Exception('localFile should not a directory!') if hdfsClient.exists(hdfsFilePath): if override: 
hdfsClient.delete(hdfsFilePath) diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index 834db206ae..e8ec7446dd 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -54,15 +54,18 @@ def main_loop(args): if retCode is not None: print('subprocess terminated. Exit code is {}. Quit'.format(retCode)) #copy local directory to hdfs - local_directory = os.environ['NNI_OUTPUT_DIR'] + nni_local_output_dir = os.environ['NNI_OUTPUT_DIR'] trial_job_id = os.environ['NNI_TRIAL_JOB_ID'] exp_id = os.environ['NNI_EXP_ID'] hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name) - print(local_directory, args.pai_hdfs_output_dir) - if copyDirectoryToHdfs(local_directory, args.pai_hdfs_output_dir, hdfs_client): - print('copy directory success!') - else: - print('copy directory failed!') + print(nni_local_output_dir, args.pai_hdfs_output_dir) + try: + if copyDirectoryToHdfs(nni_local_output_dir, args.pai_hdfs_output_dir, hdfs_client): + print('copy directory success!') + else: + print('copy directory failed!') + except Exception as exception: + print(exception) break else: print('subprocess pid: {} is still alive'.format(process.pid)) From 7f9baeac580e27b740617e901bdc6a594016baa6 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 26 Sep 2018 16:33:53 +0800 Subject: [PATCH 43/60] fix comment --- src/nni_manager/training_service/pai/paiTrainingService.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 3cc38c9d45..3ebe74f723 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -63,6 +63,7 @@ class PAITrainingService 
implements TrainingService { private paiToken? : string; private experimentId! : string; private readonly paiJobCollector : PAIJobInfoCollector; + private hdfsDirPattern: string; constructor() { this.log = getLogger(); @@ -72,6 +73,7 @@ class PAITrainingService implements TrainingService { this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); this.experimentId = getExperimentId(); this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); + this.hdfsDirPattern = 'hdfs://(?([0-9]{1,3}.){3}[0-9]{1,3}):[0-9]{2,5}(?/.*)'; } public async run(): Promise { @@ -150,10 +152,8 @@ class PAITrainingService implements TrainingService { // Step 1. Prepare PAI job configuration const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); - - const hdfsDirPattern: string = 'hdfs://(?([0-9]{1,3}.){3}[0-9]{1,3}):[0-9]{2,5}(?/.*)' - const hdfsDirContent = this.paiTrialConfig.outputDir.match(hdfsDirPattern) + const hdfsDirContent = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern) if(hdfsDirContent === null){ throw new Error('Trial outputDir format Error'); From 4af5c605d79970b8fc720d6bfc09cffa6d6747f2 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Wed, 26 Sep 2018 16:48:49 +0800 Subject: [PATCH 44/60] Implement cancel job API for pai training service --- .../pai/paiTrainingService.ts | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index c792e08c1b..2236690c18 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -241,10 +241,39 @@ class PAITrainingService implements TrainingService { } public cancelTrialJob(trialJobId: string): Promise { - this.log.info(`PAI Training service cancelTrialJob: jobId: ${trialJobId}`); + const 
trialJobDetail : PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); const deferred : Deferred = new Deferred(); + if(!trialJobDetail) { + this.log.error(`cancelTrialJob: trial job id ${trialJobId} not found`); + return Promise.reject(); + } + + if(!this.paiClusterConfig) { + throw new Error('PAI Cluster config is not initialized'); + } + if (!this.paiToken) { + throw new Error('PAI token is not initialized'); + } + + const stopJobRequest: request.Options = { + uri: `http://${this.paiClusterConfig.host}:9186/api/v1/jobs/${trialJobDetail.paiJobName}/executionType`, + method: 'PUT', + json: true, + body: {'value' : 'STOP'}, + headers: { + "Content-Type": "application/json", + "Authorization": 'Bearer ' + this.paiToken + } + }; + request(stopJobRequest, (error: Error, response: request.Response, body: any) => { + if (error || response.statusCode >= 400) { + this.log.error(`PAI Training service: stop trial ${trialJobId} to PAI Cluster failed!`); + deferred.reject(error ? error.message : 'Stop trial failed, http code: ' + response.statusCode); + } else { + deferred.resolve(); + } + }); - deferred.resolve(); return deferred.promise; } From eb548cf734af797ff8dd9248cc1ba9c6e214ed08 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 26 Sep 2018 17:31:16 +0800 Subject: [PATCH 45/60] fix default value for outputDir --- src/nni_manager/training_service/pai/paiData.ts | 2 +- .../training_service/pai/paiTrainingService.ts | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 7f8f8bb7c5..f036d3982e 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -53,4 +53,4 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = --pai_hdfs_host '{6}' --pai_user_name {7}`; export const PAI_OUTPUT_DIR_FORMAT: string = -`hdfs://{0}:9000/{1}`; +`hdfs://{0}:9000/`; diff --git 
a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 3ebe74f723..1a12370b02 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -73,7 +73,7 @@ class PAITrainingService implements TrainingService { this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); this.experimentId = getExperimentId(); this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); - this.hdfsDirPattern = 'hdfs://(?([0-9]{1,3}.){3}[0-9]{1,3}):[0-9]{2,5}(?/.*)'; + this.hdfsDirPattern = 'hdfs://(?([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?/.*)?'; } public async run(): Promise { @@ -164,7 +164,10 @@ class PAITrainingService implements TrainingService { } const hdfsHost = groups['host'] - const hdfsBaseDirectory = groups['baseDir'] + let hdfsBaseDirectory = groups['baseDir'] + if(hdfsBaseDirectory === undefined){ + hdfsBaseDirectory = "/"; + } const hdfsOutputDir = path.join(hdfsBaseDirectory, this.experimentId, trialJobId) const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, @@ -318,8 +321,7 @@ class PAITrainingService implements TrainingService { if(this.paiTrialConfig.outputDir === undefined || this.paiTrialConfig.outputDir === null){ this.paiTrialConfig.outputDir = String.Format( PAI_OUTPUT_DIR_FORMAT, - this.paiClusterConfig.host, - this.paiClusterConfig.userName + this.paiClusterConfig.host ).replace(/\r\n|\n|\r/gm, ''); } deferred.resolve(); From 4d24e87fa0b7da2142562f715cb083c94f7dc0b3 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 26 Sep 2018 18:02:52 +0800 Subject: [PATCH 46/60] fix comments --- src/nni_manager/training_service/pai/paiData.ts | 2 +- src/nni_manager/training_service/pai/paiTrainingService.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts 
b/src/nni_manager/training_service/pai/paiData.ts index f036d3982e..1f19e948ec 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -46,7 +46,7 @@ export class PAITrialJobDetail implements TrialJobDetail { } export const PAI_TRIAL_COMMAND_FORMAT: string = -`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai-t-shya2 +`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && mkdir .nni && python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}' diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 1a12370b02..2d5cecdb39 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -63,7 +63,7 @@ class PAITrainingService implements TrainingService { private paiToken? : string; private experimentId! 
: string; private readonly paiJobCollector : PAIJobInfoCollector; - private hdfsDirPattern: string; + private readonly hdfsDirPattern: string; constructor() { this.log = getLogger(); From b714a8f1dcbc0147532435d2991b7ce93e36bbfd Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Wed, 26 Sep 2018 19:30:21 +0800 Subject: [PATCH 47/60] fix pip install to master --- src/nni_manager/training_service/pai/paiData.ts | 2 +- src/nni_manager/training_service_tool/trial/trial_keeper.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 1f19e948ec..aebe31ab94 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -46,7 +46,7 @@ export class PAITrialJobDetail implements TrialJobDetail { } export const PAI_TRIAL_COMMAND_FORMAT: string = -`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai +`pip3 install -v --user git+https://github.com/yds05/nni.git@master && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && mkdir .nni && python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}' diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/src/nni_manager/training_service_tool/trial/trial_keeper.py index e8ec7446dd..0b0c7b7689 100644 --- a/src/nni_manager/training_service_tool/trial/trial_keeper.py +++ b/src/nni_manager/training_service_tool/trial/trial_keeper.py @@ -55,8 +55,6 @@ def main_loop(args): print('subprocess terminated. Exit code is {}. 
Quit'.format(retCode)) #copy local directory to hdfs nni_local_output_dir = os.environ['NNI_OUTPUT_DIR'] - trial_job_id = os.environ['NNI_TRIAL_JOB_ID'] - exp_id = os.environ['NNI_EXP_ID'] hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'), user_name=args.pai_user_name) print(nni_local_output_dir, args.pai_hdfs_output_dir) try: From b6a233a1001a3bcc8adeb9fe4cd3a99176d35472 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Sep 2018 10:12:21 +0800 Subject: [PATCH 48/60] change pip install branch in paiData.ts --- src/nni_manager/training_service/pai/paiData.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index e05c9df3ec..a3e5019766 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -46,7 +46,7 @@ export class PAITrialJobDetail implements TrialJobDetail { } export const PAI_TRIAL_COMMAND_FORMAT: string = -`pip3 install -v --user git+https://github.com/yds05/nni.git@dev-pai +`pip3 install -v --user git+https://github.com/yds05/nni.git@master && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && mkdir .nni && python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}'`; \ No newline at end of file From 52b1cc82a3ec254c9664f1f162dcc416714ddbb4 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Sep 2018 10:36:16 +0800 Subject: [PATCH 49/60] fix log path --- src/nni_manager/training_service/pai/paiData.ts | 3 ++- src/nni_manager/training_service/pai/paiTrainingService.ts | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index aebe31ab94..cc3d896af8 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ 
b/src/nni_manager/training_service/pai/paiData.ts @@ -34,7 +34,7 @@ export class PAITrialJobDetail implements TrialJobDetail { public form: JobApplicationForm; constructor(id: string, status: TrialJobStatus, paiJobName : string, - submitTime: number, workingDirectory: string, form: JobApplicationForm) { + submitTime: number, workingDirectory: string, form: JobApplicationForm, url: string) { this.id = id; this.status = status; this.paiJobName = paiJobName; @@ -42,6 +42,7 @@ export class PAITrialJobDetail implements TrialJobDetail { this.workingDirectory = workingDirectory; this.form = form; this.tags = []; + this.url = url; } } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 1334399b09..b56a483e5c 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -175,7 +175,8 @@ class PAITrainingService implements TrainingService { paiJobName, Date.now(), trialWorkingFolder, - form); + form, + this.paiTrialConfig.outputDir); this.trialJobsMap.set(trialJobId, trialJobDetail); const nniPaiTrialCommand : string = String.Format( From c27d146d3ab9e791221f5203ef377c731d00776a Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Sep 2018 11:18:57 +0800 Subject: [PATCH 50/60] add logpath logic --- src/nni_manager/training_service/pai/paiData.ts | 3 +++ .../training_service/pai/paiTrainingService.ts | 9 +++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 268a9bde27..73321ba222 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -55,3 +55,6 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = export const PAI_OUTPUT_DIR_FORMAT: string = `hdfs://{0}:9000/`; + +export const PAI_LOG_PATH_FORMAT: string = 
+`http://{0}:50070/explorer.html#{1}` diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index b56a483e5c..41eed860c5 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -39,7 +39,7 @@ import { } from '../../common/trainingService'; import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { PAIJobRestServer } from './paiJobRestServer' -import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT } from './paiData'; +import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData'; import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { String } from 'typescript-string-operations'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; @@ -169,6 +169,11 @@ class PAITrainingService implements TrainingService { hdfsBaseDirectory = "/"; } const hdfsOutputDir = path.join(hdfsBaseDirectory, this.experimentId, trialJobId) + const logPath: string = String.Format( + PAI_LOG_PATH_FORMAT, + hdfsHost, + hdfsOutputDir + ) const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', @@ -176,7 +181,7 @@ class PAITrainingService implements TrainingService { Date.now(), trialWorkingFolder, form, - this.paiTrialConfig.outputDir); + logPath); this.trialJobsMap.set(trialJobId, trialJobDetail); const nniPaiTrialCommand : string = String.Format( From 449a4f3c8366eb02d440d101e592b16258f8167f Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Sep 2018 12:00:05 +0800 Subject: [PATCH 51/60] add log path --- src/nni_manager/training_service/pai/paiData.ts | 3 +-- .../training_service/pai/paiJobInfoCollector.ts | 15 ++++++++++++--- .../training_service/pai/paiTrainingService.ts | 8 ++++---- 3 files changed, 17 
insertions(+), 9 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 73321ba222..920ef0b53c 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -34,7 +34,7 @@ export class PAITrialJobDetail implements TrialJobDetail { public form: JobApplicationForm; constructor(id: string, status: TrialJobStatus, paiJobName : string, - submitTime: number, workingDirectory: string, form: JobApplicationForm, url: string) { + submitTime: number, workingDirectory: string, form: JobApplicationForm) { this.id = id; this.status = status; this.paiJobName = paiJobName; @@ -42,7 +42,6 @@ export class PAITrialJobDetail implements TrialJobDetail { this.workingDirectory = workingDirectory; this.form = form; this.tags = []; - this.url = url; } } diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index 61fb3ec321..91cd6e4e74 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -43,7 +43,7 @@ export class PAIJobInfoCollector { this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED']; } - public async updateTrialStatusFromPAI(paiToken? : string, paiClusterConfig?: PAIClusterConfig) : Promise { + public async updateTrialStatusFromPAI(paiToken? 
: string, paiClusterConfig?: PAIClusterConfig, paiLogPath?: string) : Promise { if (!paiClusterConfig || !paiToken) { return Promise.resolve(); } @@ -53,13 +53,13 @@ export class PAIJobInfoCollector { if (!paiTrialJob) { throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); } - updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig)) + updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig, paiLogPath)) } await Promise.all(updatePaiTrialJobs); } - private getSinglePAITrialJobInfo(paiTrialJob : PAITrialJobDetail, paiToken : string, paiClusterConfig: PAIClusterConfig) : Promise { + private getSinglePAITrialJobInfo(paiTrialJob : PAITrialJobDetail, paiToken : string, paiClusterConfig: PAIClusterConfig, paiLogPath?: string) : Promise { const deferred : Deferred = new Deferred(); if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) { deferred.resolve(); @@ -101,12 +101,21 @@ export class PAIJobInfoCollector { } break; case 'SUCCEEDED': + if(paiLogPath !== undefined){ + paiTrialJob.url = paiLogPath; + } paiTrialJob.status = 'SUCCEEDED'; break; case 'STOPPED': + if(paiLogPath !== undefined){ + paiTrialJob.url = paiLogPath; + } paiTrialJob.status = 'USER_CANCELED'; break; case 'FAILED': + if(paiLogPath !== undefined){ + paiTrialJob.url = paiLogPath; + } paiTrialJob.status = 'FAILED'; break; default: diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 41eed860c5..3249ff8538 100644 --- a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -64,6 +64,7 @@ class PAITrainingService implements TrainingService { private experimentId! 
: string; private readonly paiJobCollector : PAIJobInfoCollector; private readonly hdfsDirPattern: string; + private logPath: string | undefined; constructor() { this.log = getLogger(); @@ -81,7 +82,7 @@ class PAITrainingService implements TrainingService { await restServer.start(); this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); while (!this.stopping) { - await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig); + await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig, this.logPath); await delay(3000); } } @@ -169,7 +170,7 @@ class PAITrainingService implements TrainingService { hdfsBaseDirectory = "/"; } const hdfsOutputDir = path.join(hdfsBaseDirectory, this.experimentId, trialJobId) - const logPath: string = String.Format( + this.logPath = String.Format( PAI_LOG_PATH_FORMAT, hdfsHost, hdfsOutputDir @@ -180,8 +181,7 @@ class PAITrainingService implements TrainingService { paiJobName, Date.now(), trialWorkingFolder, - form, - logPath); + form); this.trialJobsMap.set(trialJobId, trialJobDetail); const nniPaiTrialCommand : string = String.Format( From 1d9f23e6ef39574c91bf24364618a8ffc9786b65 Mon Sep 17 00:00:00 2001 From: "Shinai Yang (FA TALENT)" Date: Thu, 27 Sep 2018 14:43:43 +0800 Subject: [PATCH 52/60] refactor schema --- tools/nnicmd/config_schema.py | 62 +++++++++++++++++++++++----------- tools/nnicmd/launcher_utils.py | 15 ++++++-- 2 files changed, 54 insertions(+), 23 deletions(-) diff --git a/tools/nnicmd/config_schema.py b/tools/nnicmd/config_schema.py index f6298bff89..83b476d0c2 100644 --- a/tools/nnicmd/config_schema.py +++ b/tools/nnicmd/config_schema.py @@ -21,7 +21,7 @@ import os from schema import Schema, And, Use, Optional, Regex, Or -CONFIG_SCHEMA = Schema({ +common_schema = { 'authorName': str, 'experimentName': str, 'trialConcurrency': And(int, lambda n: 1 <=n <= 999999), @@ -44,16 +44,6 @@ Optional('classArgs'): dict, Optional('gpuNum'): 
And(int, lambda x: 0 <= x <= 99999), }), -'trial':{ - 'command': str, - 'codeDir': os.path.exists, - 'gpuNum': And(int, lambda x: 0 <= x <= 99999), - Optional('cpuNum'): And(int, lambda x: 0 <= x <= 99999), - Optional('memoryMB'): int, - Optional('image'): str, - Optional('dataDir'): str, - Optional('outputDir'): str - }, Optional('assessor'): Or({ 'builtinAssessorName': lambda x: x in ['Medianstop'], 'classArgs': { @@ -63,10 +53,41 @@ 'codeDir': os.path.exists, 'classFileName': str, 'className': str, - 'classArgs': { - 'optimize_mode': lambda x: x in ['maximize', 'minimize']}, - 'gpuNum': And(int, lambda x: 0 <= x <= 99999), + Optional('classArgs'): dict, + Optional('gpuNum'): And(int, lambda x: 0 <= x <= 99999), }), +} + +common_trial_schema = { +'trial':{ + 'command': str, + 'codeDir': os.path.exists, + 'gpuNum': And(int, lambda x: 0 <= x <= 99999) + } +} + +pai_trial_schema = { +'trial':{ + 'command': str, + 'codeDir': os.path.exists, + 'gpuNum': And(int, lambda x: 0 <= x <= 99999), + 'cpuNum': And(int, lambda x: 0 <= x <= 99999), + 'memoryMB': int, + 'image': str, + 'dataDir': Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'), + 'outputDir': Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?') + } +} + +pai_config_schema = { +'paiConfig':{ + 'userName': str, + 'passWord': str, + 'host': str +} +} + +machine_list_schima = { Optional('machineList'):[Or({ 'ip': str, 'port': And(int, lambda x: 0 < x < 65535), @@ -78,10 +99,11 @@ 'username': str, 'sshKeyPath': os.path.exists, Optional('passphrase'): str -})], -Optional('paiConfig'):{ - 'userName': str, - 'passWord': str, - 'host': str +})] } -}) + +LOCAL_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema}) + +REMOTE_CONFIG_SCHEMA = Schema({**common_schema, **common_trial_schema, **machine_list_schima}) + +PAI_CONFIG_SCHEMA = Schema({**common_schema, **pai_trial_schema, **pai_config_schema}) \ No newline at end of file diff --git a/tools/nnicmd/launcher_utils.py 
b/tools/nnicmd/launcher_utils.py index 384e3a6eb7..30c9cea13e 100644 --- a/tools/nnicmd/launcher_utils.py +++ b/tools/nnicmd/launcher_utils.py @@ -20,8 +20,8 @@ import os import json -from .config_schema import CONFIG_SCHEMA -from .common_utils import get_json_content +from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA +from .common_utils import get_json_content, print_error def expand_path(experiment_config, key): '''Change '~' to user home directory''' @@ -81,8 +81,17 @@ def validate_search_space_content(experiment_config): def validate_common_content(experiment_config): '''Validate whether the common values in experiment_config is valid''' + if not experiment_config.get('trainingServicePlatform') or \ + experiment_config.get('trainingServicePlatform') not in ['local', 'remote', 'pai']: + print_error('Please set correct trainingServicePlatform!') + exit(0) + schema_dict = { + 'local': LOCAL_CONFIG_SCHEMA, + 'remote': REMOTE_CONFIG_SCHEMA, + 'pai': PAI_CONFIG_SCHEMA + } try: - CONFIG_SCHEMA.validate(experiment_config) + schema_dict.get(experiment_config['trainingServicePlatform']).validate(experiment_config) #set default value if experiment_config.get('maxExecDuration') is None: experiment_config['maxExecDuration'] = '999d' From aa552c07801dacf1544e647bdf27c2de7960fc7e Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 14:44:24 +0800 Subject: [PATCH 53/60] Fix bug that all trials use the same hdfs log path --- .../training_service/pai/paiData.ts | 4 ++- .../pai/paiJobInfoCollector.ts | 20 ++++--------- .../pai/paiTrainingService.ts | 28 +++++++++---------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 920ef0b53c..2594953251 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -32,9 +32,10 @@ export class PAITrialJobDetail 
implements TrialJobDetail { public url?: string; public workingDirectory: string; public form: JobApplicationForm; + public hdfsLogPath: string; constructor(id: string, status: TrialJobStatus, paiJobName : string, - submitTime: number, workingDirectory: string, form: JobApplicationForm) { + submitTime: number, workingDirectory: string, form: JobApplicationForm, hdfsLogPath: string) { this.id = id; this.status = status; this.paiJobName = paiJobName; @@ -42,6 +43,7 @@ export class PAITrialJobDetail implements TrialJobDetail { this.workingDirectory = workingDirectory; this.form = form; this.tags = []; + this.hdfsLogPath = hdfsLogPath; } } diff --git a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts index 91cd6e4e74..041151c47d 100644 --- a/src/nni_manager/training_service/pai/paiJobInfoCollector.ts +++ b/src/nni_manager/training_service/pai/paiJobInfoCollector.ts @@ -43,7 +43,7 @@ export class PAIJobInfoCollector { this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED']; } - public async updateTrialStatusFromPAI(paiToken? : string, paiClusterConfig?: PAIClusterConfig, paiLogPath?: string) : Promise { + public async updateTrialStatusFromPAI(paiToken? 
: string, paiClusterConfig?: PAIClusterConfig) : Promise { if (!paiClusterConfig || !paiToken) { return Promise.resolve(); } @@ -53,13 +53,13 @@ export class PAIJobInfoCollector { if (!paiTrialJob) { throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); } - updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig, paiLogPath)) + updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig)) } await Promise.all(updatePaiTrialJobs); } - private getSinglePAITrialJobInfo(paiTrialJob : PAITrialJobDetail, paiToken : string, paiClusterConfig: PAIClusterConfig, paiLogPath?: string) : Promise { + private getSinglePAITrialJobInfo(paiTrialJob : PAITrialJobDetail, paiToken : string, paiClusterConfig: PAIClusterConfig) : Promise { const deferred : Deferred = new Deferred(); if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) { deferred.resolve(); @@ -101,21 +101,12 @@ export class PAIJobInfoCollector { } break; case 'SUCCEEDED': - if(paiLogPath !== undefined){ - paiTrialJob.url = paiLogPath; - } paiTrialJob.status = 'SUCCEEDED'; break; case 'STOPPED': - if(paiLogPath !== undefined){ - paiTrialJob.url = paiLogPath; - } paiTrialJob.status = 'USER_CANCELED'; break; case 'FAILED': - if(paiLogPath !== undefined){ - paiTrialJob.url = paiLogPath; - } paiTrialJob.status = 'FAILED'; break; default: @@ -130,8 +121,9 @@ export class PAIJobInfoCollector { if(!paiTrialJob.endTime) { paiTrialJob.endTime = response.body.jobStatus.completedTime; } - if(!paiTrialJob.url) { - paiTrialJob.url = response.body.jobStatus.appTrackingUrl; + // Set pai trial job's url to WebHDFS output path + if(paiTrialJob.hdfsLogPath) { + paiTrialJob.url = paiTrialJob.hdfsLogPath; } } } diff --git a/src/nni_manager/training_service/pai/paiTrainingService.ts b/src/nni_manager/training_service/pai/paiTrainingService.ts index 3249ff8538..8ed0e306f7 100644 --- 
a/src/nni_manager/training_service/pai/paiTrainingService.ts +++ b/src/nni_manager/training_service/pai/paiTrainingService.ts @@ -64,7 +64,6 @@ class PAITrainingService implements TrainingService { private experimentId! : string; private readonly paiJobCollector : PAIJobInfoCollector; private readonly hdfsDirPattern: string; - private logPath: string | undefined; constructor() { this.log = getLogger(); @@ -82,7 +81,7 @@ class PAITrainingService implements TrainingService { await restServer.start(); this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); while (!this.stopping) { - await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig, this.logPath); + await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig); await delay(3000); } } @@ -154,34 +153,35 @@ class PAITrainingService implements TrainingService { const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId); - const hdfsDirContent = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern) + const hdfsDirContent = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern); - if(hdfsDirContent === null){ + if(hdfsDirContent === null) { throw new Error('Trial outputDir format Error'); } - const groups = hdfsDirContent.groups - if(groups === undefined){ + const groups = hdfsDirContent.groups; + if(groups === undefined) { throw new Error('Trial outputDir format Error'); } - const hdfsHost = groups['host'] - let hdfsBaseDirectory = groups['baseDir'] - if(hdfsBaseDirectory === undefined){ + const hdfsHost = groups['host']; + let hdfsBaseDirectory = groups['baseDir']; + if(hdfsBaseDirectory === undefined) { hdfsBaseDirectory = "/"; } - const hdfsOutputDir = path.join(hdfsBaseDirectory, this.experimentId, trialJobId) - this.logPath = String.Format( + const hdfsOutputDir : string = path.join(hdfsBaseDirectory, this.experimentId, 
trialJobId); + const hdfsLogPath : string = String.Format( PAI_LOG_PATH_FORMAT, hdfsHost, - hdfsOutputDir - ) + hdfsOutputDir); + const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( trialJobId, 'WAITING', paiJobName, Date.now(), trialWorkingFolder, - form); + form, + hdfsLogPath); this.trialJobsMap.set(trialJobId, trialJobDetail); const nniPaiTrialCommand : string = String.Format( From cb46266cbb69c48e66ba5de2b1c66bf6385d1010 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 16:19:36 +0800 Subject: [PATCH 54/60] Update PAI training service PR comments --- Makefile | 17 +++++++-------- setup.py | 2 +- .../rest_server/restValidationSchemas.ts | 2 +- .../training_service/pai/paiData.ts | 2 +- .../training_service_tool/setup.py | 21 ------------------- src/sdk/pynni/setup.py | 2 +- tools/setup.py | 7 ++++--- .../trial => tools/trial_tool}/__init__.py | 0 .../trial => tools/trial_tool}/constants.py | 0 .../trial_tool}/hdfsClientUtility.py | 0 .../trial_tool}/metrics_reader.py | 0 .../trial => tools/trial_tool}/rest_utils.py | 0 .../test/test_hdfsClientUtility.py | 0 .../trial_tool}/trial_keeper.py | 0 .../trial => tools/trial_tool}/url_utils.py | 0 15 files changed, 15 insertions(+), 38 deletions(-) delete mode 100644 src/nni_manager/training_service_tool/setup.py rename {src/nni_manager/training_service_tool/trial => tools/trial_tool}/__init__.py (100%) rename {src/nni_manager/training_service_tool/trial => tools/trial_tool}/constants.py (100%) rename {src/nni_manager/training_service_tool/trial => tools/trial_tool}/hdfsClientUtility.py (100%) rename {src/nni_manager/training_service_tool/trial => tools/trial_tool}/metrics_reader.py (100%) rename {src/nni_manager/training_service_tool/trial => tools/trial_tool}/rest_utils.py (100%) rename {src/nni_manager/training_service_tool => tools/trial_tool}/test/test_hdfsClientUtility.py (100%) rename {src/nni_manager/training_service_tool/trial => tools/trial_tool}/trial_keeper.py (100%) rename 
{src/nni_manager/training_service_tool/trial => tools/trial_tool}/url_utils.py (100%) diff --git a/Makefile b/Makefile index 06bb89a0e9..dc71a9dc1e 100644 --- a/Makefile +++ b/Makefile @@ -92,9 +92,13 @@ build: #$(_INFO) Building nnictl $(_END) cd tools && python3 setup.py build - #$(_INFO) Building Training Service tool $(_END) - cd src/nni_manager/training_service_tool && python3 setup.py build - +# Standard installation target +# Must be invoked after building +.PHONY: install +install: install-python-modules +install: install-node-modules +install: install-scripts +install: install-examples install: #$(_INFO) Complete! You may want to add $(BIN_PATH) to your PATH environment $(_END) @@ -104,7 +108,6 @@ install: .PHONY: remote-machine-install remote-machine-install: cd src/sdk/pynni && python3 setup.py install $(PIP_MODE) - cd src/nni_manager/training_service_tool && python3 setup.py install $(PIP_MODE) # All-in-one target for non-expert users @@ -204,9 +207,6 @@ install-python-modules: #$(_INFO) Installing nnictl $(_END) cd tools && python3 setup.py install $(PIP_MODE) - #$(_INFO) Installing NNI training service tool $(_END) - cd src/nni_manager/training_service_tool && python3 setup.py install $(PIP_MODE) - .PHONY: install-node-modules install-node-modules: mkdir -p $(INSTALL_PREFIX)/nni @@ -227,9 +227,6 @@ install-dev-modules: #$(_INFO) Installing nnictl $(_END) cd tools && $(PIP_INSTALL) $(PIP_MODE) -e . - #$(_INFO) Installing NNI training service tool $(_END) - cd src/nni_manager/training_service_tool && $(PIP_INSTALL) $(PIP_MODE) -e . 
- mkdir -p $(INSTALL_PREFIX)/nni #$(_INFO) Installing NNI Manager $(_END) diff --git a/setup.py b/setup.py index 8fc15639a8..c07b1f782d 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ def run(self): 'nni_annotation': 'tools/nni_annotation', 'nni': 'src/sdk/pynni/nni', 'nnicmd': 'tools/nnicmd', - 'trial': 'src/nni_manager/training_service_tool/trial' + 'trial_tool':'tools/trial_tool' }, python_requires = '>=3.5', install_requires = [ diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 500c26aa36..24b07836a7 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -33,7 +33,7 @@ export namespace ValidationSchemas { passphrase: joi.string() })), trial_config: joi.object({ - image: joi.string().min(1), + image: joi.string().min(1), codeDir: joi.string().min(1).required(), dataDir: joi.string(), outputDir: joi.string(), diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 2594953251..338e859553 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -51,7 +51,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = `pip3 install -v --user git+https://github.com/yds05/nni.git@master && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && mkdir .nni -&& python3 -m trial.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}' +&& python3 -m trial_tool.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}' --pai_hdfs_host '{6}' --pai_user_name {7}`; export const PAI_OUTPUT_DIR_FORMAT: string = diff --git a/src/nni_manager/training_service_tool/setup.py b/src/nni_manager/training_service_tool/setup.py deleted file mode 100644 index bd22724571..0000000000 --- 
a/src/nni_manager/training_service_tool/setup.py +++ /dev/null @@ -1,21 +0,0 @@ -import setuptools - -setuptools.setup( - # NNI Training Service(nnits) package - name = 'nnits-tool', - version = '0.0.1', - packages = setuptools.find_packages(), - - python_requires = '>=3.5', - install_requires = [ - 'requests', - 'psutil', - 'pyhdfs' - ], - - author = 'Microsoft NNI Team', - author_email = 'nni@microsoft.com', - description = 'NNI Training Service Tool for Neural Network Intelligence project', - license = 'MIT', - url = 'https://github.com/Microsoft/nni' -) \ No newline at end of file diff --git a/src/sdk/pynni/setup.py b/src/sdk/pynni/setup.py index fee463e371..71b8a675a1 100644 --- a/src/sdk/pynni/setup.py +++ b/src/sdk/pynni/setup.py @@ -44,7 +44,7 @@ def read(fname): author_email = 'nni@microsoft.com', description = 'Python SDK for Neural Network Intelligence project', license = 'MIT', - url = 'https://msrasrg.visualstudio.com/NeuralNetworkIntelligence', + url = 'https://github.com/Microsoft/nni', long_description = read('README.md') ) diff --git a/tools/setup.py b/tools/setup.py index d789c265e6..2e5be68cb2 100644 --- a/tools/setup.py +++ b/tools/setup.py @@ -3,7 +3,7 @@ setuptools.setup( name = 'nnictl', version = '0.0.1', - packages = setuptools.find_packages(), + packages = setuptools.find_packages(exclude=['*test*']), python_requires = '>=3.5', install_requires = [ @@ -11,12 +11,13 @@ 'pyyaml', 'psutil', 'astor', - 'schema' + 'schema', + 'pyhdfs' ], author = 'Microsoft NNI Team', author_email = 'nni@microsoft.com', description = 'NNI control for Neural Network Intelligence project', license = 'MIT', - url = 'https://msrasrg.visualstudio.com/NeuralNetworkIntelligence', + url = 'https://github.com/Microsoft/nni', ) diff --git a/src/nni_manager/training_service_tool/trial/__init__.py b/tools/trial_tool/__init__.py similarity index 100% rename from src/nni_manager/training_service_tool/trial/__init__.py rename to tools/trial_tool/__init__.py diff --git 
a/src/nni_manager/training_service_tool/trial/constants.py b/tools/trial_tool/constants.py similarity index 100% rename from src/nni_manager/training_service_tool/trial/constants.py rename to tools/trial_tool/constants.py diff --git a/src/nni_manager/training_service_tool/trial/hdfsClientUtility.py b/tools/trial_tool/hdfsClientUtility.py similarity index 100% rename from src/nni_manager/training_service_tool/trial/hdfsClientUtility.py rename to tools/trial_tool/hdfsClientUtility.py diff --git a/src/nni_manager/training_service_tool/trial/metrics_reader.py b/tools/trial_tool/metrics_reader.py similarity index 100% rename from src/nni_manager/training_service_tool/trial/metrics_reader.py rename to tools/trial_tool/metrics_reader.py diff --git a/src/nni_manager/training_service_tool/trial/rest_utils.py b/tools/trial_tool/rest_utils.py similarity index 100% rename from src/nni_manager/training_service_tool/trial/rest_utils.py rename to tools/trial_tool/rest_utils.py diff --git a/src/nni_manager/training_service_tool/test/test_hdfsClientUtility.py b/tools/trial_tool/test/test_hdfsClientUtility.py similarity index 100% rename from src/nni_manager/training_service_tool/test/test_hdfsClientUtility.py rename to tools/trial_tool/test/test_hdfsClientUtility.py diff --git a/src/nni_manager/training_service_tool/trial/trial_keeper.py b/tools/trial_tool/trial_keeper.py similarity index 100% rename from src/nni_manager/training_service_tool/trial/trial_keeper.py rename to tools/trial_tool/trial_keeper.py diff --git a/src/nni_manager/training_service_tool/trial/url_utils.py b/tools/trial_tool/url_utils.py similarity index 100% rename from src/nni_manager/training_service_tool/trial/url_utils.py rename to tools/trial_tool/url_utils.py From f09a65157d92bec49c39fab63c3ae7cde892096f Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 16:51:51 +0800 Subject: [PATCH 55/60] Remove unused nnits-tool in uninstallation --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git 
a/Makefile b/Makefile index dc71a9dc1e..4d7d0a11b3 100644 --- a/Makefile +++ b/Makefile @@ -148,7 +148,6 @@ dev-install: uninstall: -$(PIP_UNINSTALL) -y nni -$(PIP_UNINSTALL) -y nnictl - -$(PIP_UNINSTALL) -y nnits-tool -rm -rf $(INSTALL_PREFIX)/nni -rm -f $(BIN_PATH)/nnimanager -rm -f $(BIN_PATH)/nnictl From 94c92c34b156aacd9317f2c009d15b73d5b16725 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 16:53:22 +0800 Subject: [PATCH 56/60] Remove unused trianing_service_tool package in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c07b1f782d..684fd5e6e8 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ def run(self): license = 'MIT', url = 'https://github.com/Microsoft/nni', - packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools') + find_packages('src/nni_manager/training_service_tool'), + packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools'), package_dir = { 'nni_annotation': 'tools/nni_annotation', 'nni': 'src/sdk/pynni/nni', From 2eca5d970b0ae05f456eb706b44e7ccae03c3371 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 16:59:09 +0800 Subject: [PATCH 57/60] Update setup.py version to 0.2.0 --- setup.py | 2 +- src/sdk/pynni/setup.py | 2 +- tools/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 684fd5e6e8..bfdda8a283 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ def run(self): setup( name = 'NNI', - version = '0.1.0', + version = '0.2.0', author = 'Microsoft NNI Team', author_email = 'nni@microsoft.com', description = 'Neural Network Intelligence project', diff --git a/src/sdk/pynni/setup.py b/src/sdk/pynni/setup.py index 71b8a675a1..fae0ceac41 100644 --- a/src/sdk/pynni/setup.py +++ b/src/sdk/pynni/setup.py @@ -27,7 +27,7 @@ def read(fname): setuptools.setup( name = 'nni', - version = '0.0.1', + version = '0.2.0', packages = 
setuptools.find_packages(exclude=['tests']), python_requires = '>=3.5', diff --git a/tools/setup.py b/tools/setup.py index 2e5be68cb2..7b368f4267 100644 --- a/tools/setup.py +++ b/tools/setup.py @@ -2,7 +2,7 @@ setuptools.setup( name = 'nnictl', - version = '0.0.1', + version = '0.2.0', packages = setuptools.find_packages(exclude=['*test*']), python_requires = '>=3.5', From 717856e2bceea943953f9b2eb1a9b5364ae18c6a Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 17:07:27 +0800 Subject: [PATCH 58/60] Change pip install repo to Microsoft/nni --- src/nni_manager/training_service/pai/paiData.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/pai/paiData.ts b/src/nni_manager/training_service/pai/paiData.ts index 338e859553..4b954deb4e 100644 --- a/src/nni_manager/training_service/pai/paiData.ts +++ b/src/nni_manager/training_service/pai/paiData.ts @@ -48,7 +48,7 @@ export class PAITrialJobDetail implements TrialJobDetail { } export const PAI_TRIAL_COMMAND_FORMAT: string = -`pip3 install -v --user git+https://github.com/yds05/nni.git@master +`pip3 install -v --user git+https://github.com/Microsoft/nni.git@master && export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2} && cd $NNI_SYS_DIR && mkdir .nni && python3 -m trial_tool.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}' From c32cd52f616e8e48858539f064a4305fbd52d375 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 18:21:55 +0800 Subject: [PATCH 59/60] Update NNI v0.2 release notes --- docs/RELEASE.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/RELEASE.md b/docs/RELEASE.md index 745566a96c..7bc5ef62c0 100644 --- a/docs/RELEASE.md +++ b/docs/RELEASE.md @@ -1,3 +1,18 @@ +# Release 0.2.0 - 9/29/2018 +## Major Features + * Support for [OpenPAI](https://github.com/Microsoft/pai) (aka pai) Training Service + * Support training services on pai 
mode. NNI trials will be scheduled to run on OpenPAI cluster + * NNI trial's output (including logs and model file) will be copied to OpenPAI HDFS for further debugging and checking + * Support [SMAC](https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf) tuner + * [SMAC](https://www.cs.ubc.ca/~hutter/papers/10-TR-SMAC.pdf) is based on Sequential Model-Based Optimization (SMBO). It adapts the most prominent previously used model class (Gaussian stochastic process models) and introduces the model class of random forests to SMBO to handle categorical parameters. The SMAC supported by NNI is a wrapper on [SMAC3](https://github.com/automl/SMAC3) + * Support NNI installation on [conda](https://conda.io/docs/index.html) and python virtual environment + * Others + * Update ga squad example and related documentation + * WebUI UX small enhancement and bug fixsss + +## Known Issues +[Known Issues in release 0.2.0](https://github.com/Microsoft/nni/labels/nni020knownissues). + # Release 0.1.0 - 9/10/2018 (initial release) Initial release of Neural Network Intelligence (NNI). From 76c10e8f4aac76d549d1e8e2fe6fdbd9b9246fa0 Mon Sep 17 00:00:00 2001 From: Deshui Yu Date: Thu, 27 Sep 2018 18:57:41 +0800 Subject: [PATCH 60/60] Fix typo based on PR comments --- docs/RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/RELEASE.md b/docs/RELEASE.md index 7bc5ef62c0..e7ef20c729 100644 --- a/docs/RELEASE.md +++ b/docs/RELEASE.md @@ -8,7 +8,7 @@ * Support NNI installation on [conda](https://conda.io/docs/index.html) and python virtual environment * Others * Update ga squad example and related documentation - * WebUI UX small enhancement and bug fixsss + * WebUI UX small enhancement and bug fix ## Known Issues [Known Issues in release 0.2.0](https://github.com/Microsoft/nni/labels/nni020knownissues).