Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

PAI Training Service implementation #128

Merged
merged 71 commits into from
Sep 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
9a8ac16
PAI Training service implementation, v1
Sep 19, 2018
8983045
update trial package directory in setup.py
Sep 19, 2018
248d0eb
Update setup.py package info
Sep 19, 2018
43fca76
Update trial keeper module, use IP address for pai training service ma…
Sep 20, 2018
4fe49de
Update metrics file path in reader
Sep 20, 2018
66a54e1
Fix metrics file path issue
Sep 21, 2018
65709d3
Update pai integration, full implementation of pai training service
Sep 21, 2018
c1a3d34
Do not send metrics if it is empty
Sep 21, 2018
232d0e8
Update nnictl, to support pai configuration
Sep 21, 2018
a5d4a20
fix repo
Sep 24, 2018
cd64e5f
add hdfs_output_dir
Sep 24, 2018
889e066
add copy logic
Sep 24, 2018
de9c374
debug
Sep 24, 2018
e98b0ac
update hdfsUtility
Sep 24, 2018
272411a
debug
Sep 24, 2018
4cba4d1
debug
Sep 24, 2018
45d1031
fix setup.py bug
Sep 24, 2018
e63ffc0
fix bug
Sep 24, 2018
954d640
debug
Sep 24, 2018
0410d05
debug
Sep 24, 2018
e3788d2
add exception handler
Sep 25, 2018
793cbf1
fix bug
Sep 25, 2018
b14c108
debug
Sep 25, 2018
0ae9f6d
fix bug
Sep 25, 2018
5938310
fix bug
Sep 25, 2018
c756188
fix bug
Sep 25, 2018
b6ce813
split metrics into single line, and read metrics no matter if subproc…
Sep 25, 2018
dc0f96b
add unit test for hdfsClientUtility
Sep 25, 2018
55b6e08
fix bug
Sep 25, 2018
2529376
Add experiment id in update metrics url to differ trials
Sep 25, 2018
0f7d40c
add default outputdir
Sep 25, 2018
43d7ab7
update
Sep 25, 2018
9c53f47
fix trial_keeper
Sep 25, 2018
beac29c
fix bug
Sep 25, 2018
c54ad7a
add default value for nnioutputdir
Sep 25, 2018
c214362
fix bug
Sep 25, 2018
60bf770
remove unused code
Sep 25, 2018
fad2ba3
PAI Training service implementation, v1 (#1)
yds05 Sep 25, 2018
7f06762
fix conflict
Sep 25, 2018
aa4f306
fix conflict
Sep 25, 2018
45c9600
fix conflict
Sep 25, 2018
24dd1b6
Remove unused import and paiTrialConfig file
Sep 25, 2018
84d278c
Merge branch 'master' into dev-pai
yds05 Sep 25, 2018
9febbd3
Merge pull request #3 from yds05/dev-pai
yds05 Sep 25, 2018
7a43c54
fix conflict
Sep 25, 2018
3e0cce2
refactor code
Sep 26, 2018
ef1eaf8
fix comments
Sep 26, 2018
7f9baea
fix comment
Sep 26, 2018
4af5c60
Implement cancel job API for pai training service
Sep 26, 2018
eb548cf
fix default value for outputDir
Sep 26, 2018
4d24e87
fix comments
Sep 26, 2018
6325bd3
Merge pull request #4 from yds05/dev-pai-desy
yds05 Sep 26, 2018
1db913c
Merge pull request #2 from yds05/dev-pai-t-shya2
SparkSnail Sep 26, 2018
5487975
Merge branch 'master' of https://github.com/Microsoft/nni into Micros…
Sep 26, 2018
88b1876
Merge branch 'Microsoft-master'
Sep 26, 2018
90c9e69
Merge pull request #6 from yds05/master
SparkSnail Sep 26, 2018
b714a8f
fix pip install to master
Sep 26, 2018
b6a233a
change pip install branch in paiData.ts
Sep 27, 2018
9511174
fix conflict
Sep 27, 2018
52b1cc8
fix log path
Sep 27, 2018
76bd378
fix conflict
Sep 27, 2018
c27d146
add logpath logic
Sep 27, 2018
449a4f3
add log path
Sep 27, 2018
1d9f23e
refactor schema
Sep 27, 2018
aa552c0
Fix bug that all trials use the same hdfs log path
Sep 27, 2018
9edfb34
Merge pull request #7 from yds05/dev-pai-t-shya2
yds05 Sep 27, 2018
cb46266
Update PAI training service PR comments
Sep 27, 2018
f09a651
Remove unused nnits-tool in uninstallation
Sep 27, 2018
94c92c3
Remove unused training_service_tool package in setup.py
Sep 27, 2018
2eca5d9
Update setup.py version to 0.2.0
Sep 27, 2018
717856e
Change pip install repo to Microsoft/nni
Sep 27, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ build:
#$(_INFO) Building nnictl $(_END)
cd tools && python3 setup.py build


# Standard installation target
# Must be invoked after building
.PHONY: install
Expand Down Expand Up @@ -207,7 +206,6 @@ install-python-modules:
#$(_INFO) Installing nnictl $(_END)
cd tools && python3 setup.py install $(PIP_MODE)


.PHONY: install-node-modules
install-node-modules:
mkdir -p $(INSTALL_PREFIX)/nni
Expand All @@ -227,7 +225,7 @@ install-dev-modules:

#$(_INFO) Installing nnictl $(_END)
cd tools && $(PIP_INSTALL) $(PIP_MODE) -e .

mkdir -p $(INSTALL_PREFIX)/nni

#$(_INFO) Installing NNI Manager $(_END)
Expand Down
8 changes: 5 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def run(self):

setup(
name = 'NNI',
version = '0.1.0',
version = '0.2.0',
author = 'Microsoft NNI Team',
author_email = 'nni@microsoft.com',
description = 'Neural Network Intelligence project',
Expand All @@ -47,7 +47,8 @@ def run(self):
package_dir = {
'nni_annotation': 'tools/nni_annotation',
'nni': 'src/sdk/pynni/nni',
'nnicmd': 'tools/nnicmd'
'nnicmd': 'tools/nnicmd',
'trial_tool':'tools/trial_tool'
},
python_requires = '>=3.5',
install_requires = [
Expand All @@ -59,7 +60,8 @@ def run(self):
'pyyaml',
'requests',
'scipy',
'schema'
'schema',
'pyhdfs'
],

cmdclass={
Expand Down
18 changes: 16 additions & 2 deletions src/nni_manager/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -225,5 +225,19 @@ function cleanupUnitTest(): void {
Container.restore(ExperimentStartupInfo);
}

export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, mkDirP, delay, prepareUnitTest,
parseArg, cleanupUnitTest, uniqueString, randomSelect };
/**
* Get IPv4 address of current machine
*/
function getIPV4Address(): string {
yds05 marked this conversation as resolved.
Show resolved Hide resolved
let ipv4Address : string = '';

for(const item of os.networkInterfaces().eth0) {
if(item.family === 'IPv4') {
ipv4Address = item.address;
}
}
return ipv4Address;
}

export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, getIPV4Address,
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
7 changes: 5 additions & 2 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini
import {
RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService'


function initStartupInfo(startExpMode: string, resumeExperimentId: string) {
Expand All @@ -49,6 +50,8 @@ async function initContainer(platformMode: string): Promise<void> {
Container.bind(TrainingService).to(LocalTrainingServiceForGPU).scope(Scope.Singleton);
} else if (platformMode === 'remote') {
Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
} else if (platformMode === 'pai'){
Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
} else {
throw new Error(`Error: unsupported mode: ${mode}`);
}
Expand All @@ -61,7 +64,7 @@ async function initContainer(platformMode: string): Promise<void> {
}

function usage(): void {
console.info('usage: node main.js --port <port> --mode <local/remote> --start_mode <new/resume> --experiment_id <id>');
console.info('usage: node main.js --port <port> --mode <local/remote/pai> --start_mode <new/resume> --experiment_id <id>');
}

let port: number = NNIRestServer.DEFAULT_PORT;
Expand All @@ -71,7 +74,7 @@ if (strPort && strPort.length > 0) {
}

const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote'].includes(mode)) {
if (!['local', 'remote', 'pai'].includes(mode)) {
usage();
process.exit(1);
}
Expand Down
4 changes: 3 additions & 1 deletion src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
"tree-kill": "^1.2.0",
"ts-deferred": "^1.0.4",
"typescript-ioc": "^1.2.4",
"typescript-string-operations": "^1.3.1"
"typescript-string-operations": "^1.3.1",
"webhdfs":"^1.2.0"
},
"devDependencies": {
"@types/chai": "^4.1.4",
Expand All @@ -40,6 +41,7 @@
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
"rmdir": "^1.2.0",
"tmp": "^0.0.33",
"ts-node": "^7.0.0",
"tslint": "^5.11.0",
Expand Down
14 changes: 12 additions & 2 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,19 @@ export namespace ValidationSchemas {
passphrase: joi.string()
})),
trial_config: joi.object({
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it might be better to improve this validation, this mixes the validation of trial's local/remote mode and pai mode.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, @SparkSnail will update this PR to improve the validation

gpuNum: joi.number().min(0).required(),
image: joi.string().min(1),
codeDir: joi.string().min(1).required(),
command: joi.string().min(1).required()
dataDir: joi.string(),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
}),
pai_config: joi.object({
userName: joi.string().min(1).required(),
passWord: joi.string().min(1).required(),
host: joi.string().min(1).required()
})
}
};
Expand Down
37 changes: 37 additions & 0 deletions src/nni_manager/training_service/common/jobMetrics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

'use strict';

import { TrialJobStatus } from '../../common/trainingService';

// tslint:disable-next-line:max-classes-per-file
export class JobMetrics {
    /**
     * Immutable record of a trial job's reported metrics.
     *
     * Uses TypeScript parameter properties: each constructor argument is
     * declared directly as a public readonly field, keeping the same public
     * shape (jobId, metrics, jobStatus, endTimestamp) and the same
     * constructor signature as before.
     *
     * @param jobId        identifier of the trial job the metrics belong to
     * @param metrics      raw metric strings reported by the trial
     * @param jobStatus    status of the trial job when metrics were collected
     * @param endTimestamp timestamp associated with the end of the job
     */
    constructor(
        public readonly jobId: string,
        public readonly metrics: string[],
        public readonly jobStatus: TrialJobStatus,
        public readonly endTimestamp: number) {
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,6 @@ export enum TrialConfigMetadataKey {
MACHINE_LIST = 'machine_list',
TRIAL_CONFIG = 'trial_config',
EXPERIMENT_ID = 'experimentId',
RANDOM_SCHEDULER = 'random_scheduler'
RANDOM_SCHEDULER = 'random_scheduler',
PAI_CLUSTER_CONFIG = 'pai_config'
}
200 changes: 200 additions & 0 deletions src/nni_manager/training_service/pai/hdfsClientUtility.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

import * as path from 'path';
import * as fs from 'fs';
import { Deferred } from 'ts-deferred';
import { getLogger } from '../../common/log';

/**
 * HDFS client utility: copy files/directories to HDFS, read back files and
 * directory listings, create and delete HDFS paths.
 *
 * `hdfsClient` is a webhdfs client instance created elsewhere; it is typed
 * `any` throughout because the webhdfs package ships no typings.
 */
export namespace HDFSClientUtility {
    /**
     * Copy a local file to an HDFS path.
     *
     * @param localFilePath local file path (source)
     * @param hdfsFilePath hdfs file path (target)
     * @param hdfsClient hdfs client
     */
    export async function copyFileToHdfs(localFilePath : string, hdfsFilePath : string, hdfsClient : any) : Promise<void> {
        const deferred: Deferred<void> = new Deferred<void>();
        fs.exists(localFilePath, (exists : boolean) => {
            // Detect if local file exist
            if (exists) {
                const localFileStream = fs.createReadStream(localFilePath);
                const hdfsFileStream = hdfsClient.createWriteStream(hdfsFilePath);
                localFileStream.pipe(hdfsFileStream);
                // Fix: handle read-side errors too. Previously only the HDFS
                // write stream had an error handler, so a failure reading the
                // local file left the returned promise pending forever.
                localFileStream.on('error', (err : any) => {
                    getLogger().error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`);
                    deferred.reject(err);
                });
                hdfsFileStream.on('finish', function onFinish () {
                    deferred.resolve();
                });
                hdfsFileStream.on('error', (err : any) => {
                    getLogger().error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`);
                    deferred.reject(err);
                });
            } else {
                getLogger().error(`HDFSCientUtility:copyFileToHdfs, ${localFilePath} doesn't exist locally`);
                deferred.reject('file not exist!');
            }
        });
        return deferred.promise;
    }

    /**
     * Recursively copy a local directory to an HDFS directory.
     * Stops at the first failed file/subdirectory copy and rejects.
     *
     * @param localDirectory local directory
     * @param hdfsDirectory HDFS directory
     * @param hdfsClient HDFS client
     */
    export async function copyDirectoryToHdfs(localDirectory : string, hdfsDirectory : string, hdfsClient : any) : Promise<void> {
        const deferred: Deferred<void> = new Deferred<void>();
        // TODO: fs.readdirSync doesn't support ~($HOME)
        const fileNameArray: string[] = fs.readdirSync(localDirectory);

        for (const fileName of fileNameArray) {
            const fullFilePath: string = path.join(localDirectory, fileName);
            try {
                if (fs.lstatSync(fullFilePath).isFile()) {
                    await copyFileToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient);
                } else {
                    // If filePath is a directory, recursively copy it to the remote directory
                    await copyDirectoryToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient);
                }
            } catch (error) {
                // Fix: stop on first failure. Previously the loop kept copying
                // remaining entries after a reject and then called resolve()
                // on the already-settled deferred.
                deferred.reject(error);
                return deferred.promise;
            }
        }
        // All files/directories are copied successfully, resolve
        deferred.resolve();

        return deferred.promise;
    }

    /**
     * Read the full content of an HDFS file into a Buffer.
     *
     * @param hdfsPath HDFS file path
     * @param hdfsClient HDFS client
     */
    export async function readFileFromHDFS(hdfsPath : string, hdfsClient : any) : Promise<Buffer> {
        const deferred: Deferred<Buffer> = new Deferred<Buffer>();
        let buffer : Buffer = Buffer.alloc(0);

        const exist : boolean = await pathExists(hdfsPath, hdfsClient);
        if (!exist) {
            // Fix: return immediately. Previously execution fell through and
            // opened a read stream on a path known not to exist.
            deferred.reject(`${hdfsPath} doesn't exists`);
            return deferred.promise;
        }

        const remoteFileStream = hdfsClient.createReadStream(hdfsPath);
        remoteFileStream.on('error', (err : any) => {
            // Reject with the error
            deferred.reject(err);
        });

        remoteFileStream.on('data', (chunk : any) => {
            // Concat the data chunk to buffer
            buffer = Buffer.concat([buffer, chunk]);
        });

        remoteFileStream.on('finish', function onFinish () {
            // Download is done, resolve with the accumulated content
            deferred.resolve(buffer);
        });

        return deferred.promise;
    }

    /**
     * Check if an HDFS path already exists.
     *
     * @param hdfsPath target path to check in HDFS
     * @param hdfsClient HDFS client
     */
    export async function pathExists(hdfsPath : string, hdfsClient : any) : Promise<boolean> {
        const deferred : Deferred<boolean> = new Deferred<boolean>();
        hdfsClient.exists(hdfsPath, (exist : boolean ) => {
            deferred.resolve(exist);
        });

        return deferred.promise;
    }

    /**
     * Mkdir in HDFS, use default permission 755.
     *
     * @param hdfsPath the path in HDFS. It could be either file or directory
     * @param hdfsClient HDFS client
     */
    export function mkdir(hdfsPath : string, hdfsClient : any) : Promise<boolean> {
        const deferred : Deferred<boolean> = new Deferred<boolean>();

        hdfsClient.mkdir(hdfsPath, (err : any)=> {
            if (!err) {
                deferred.resolve(true);
            } else {
                deferred.reject(err.message);
            }
        });

        return deferred.promise;
    }

    /**
     * Read directory contents.
     *
     * @param hdfsPath the path in HDFS. It could be either file or directory
     * @param hdfsClient HDFS client
     */
    export async function readdir(hdfsPath : string, hdfsClient : any) : Promise<string[]> {
        const deferred : Deferred<string[]> = new Deferred<string[]>();
        const exist : boolean = await pathExists(hdfsPath, hdfsClient);
        if (!exist) {
            // Fix: return immediately. Previously execution fell through and
            // issued a readdir on a path known not to exist.
            deferred.reject(`${hdfsPath} doesn't exists`);
            return deferred.promise;
        }

        hdfsClient.readdir(hdfsPath, (err : any, files : any[] ) => {
            if (err) {
                deferred.reject(err);
                return;
            }

            deferred.resolve(files);
        });

        return deferred.promise;
    }

    /**
     * Delete an HDFS path.
     *
     * @param hdfsPath the path in HDFS. It could be either file or directory
     * @param hdfsClient HDFS client
     * @param recursive whether to delete recursively (default true)
     */
    export function deletePath(hdfsPath : string, hdfsClient : any, recursive : boolean = true) : Promise<boolean> {
        const deferred : Deferred<boolean> = new Deferred<boolean>();
        hdfsClient.unlink(hdfsPath, recursive, (err : any)=> {
            if (!err) {
                deferred.resolve(true);
            } else {
                deferred.reject(err.message);
            }
        });
        return deferred.promise;
    }
}
Loading