Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Kubeflow TrainingService support, v1 #373

Merged
merged 35 commits into from
Nov 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
bc8ee41
Merge pull request #2 from Microsoft/master
yds05 Nov 8, 2018
aedb000
Change base image from devel to runtime, to reduce docker image size
Nov 8, 2018
69e9de0
Support running multiple experiment for PAI
Nov 9, 2018
53c2cd9
Fix a bug regarding to recuisively reference between paiRestServer and
Nov 12, 2018
bd5dcd7
Initial version for Kubeflow Training Service
Nov 12, 2018
1e40376
Merge pull request #3 from Microsoft/master
yds05 Nov 12, 2018
575cbb9
Merge branch 'master' into kubeflow
Nov 12, 2018
fd052ca
simple refactor
Nov 12, 2018
b5d4e9d
Merge pull request #4 from Microsoft/master
yds05 Nov 13, 2018
c83a2ce
Merge branch 'master' into kubeflow
Nov 13, 2018
7e04718
Remove unused 51189 const variable
Nov 13, 2018
3eaea4f
Support launch kubeflow training service through nnictl
Nov 13, 2018
7d4e3ca
Enable kubeflow platfrom in NNI sdk
Nov 13, 2018
5f3f069
Add shell comand to launch kubeflow trial job
Nov 14, 2018
4bcda91
Test for ubuntu nFS logic
Nov 14, 2018
2e11a68
Fix trial nfs folder small bug
Nov 14, 2018
33d24c6
fix small bug
Nov 14, 2018
a2164ee
fix small bug
Nov 14, 2018
f63e16b
Move get_sequence_id() from reading sequence_id file to read env param
Nov 15, 2018
2b29427
Add cancelJob support for Kubeflow training service
Nov 15, 2018
e4e27e7
Add some delay to check kubeflow job status
Nov 15, 2018
8abd518
Fix a bug caused by error checking result.stderr
Nov 15, 2018
9bba10f
Add cleanup function for kubeflow training service
Nov 15, 2018
7256a82
Move mount operation from submitJob to setClusterMetadata
Nov 15, 2018
6efec43
Change set trial sequence id, change from upload sequence_id file to
Nov 16, 2018
634a464
Change NNI_OUT_DIR to nfs path
Nov 16, 2018
01171c4
Add file server log path for Kubeflow trial job
Nov 16, 2018
baedae3
Correct nfs log url path
Nov 16, 2018
b376d5b
Code refactor: move /tmp/nfs to const var, and use path join for nfs …
Nov 16, 2018
226fc16
Add install and check NNI package
Nov 16, 2018
d7f4556
Merge pull request #6 from Microsoft/master
yds05 Nov 16, 2018
7da87c4
Merge pull request #7 from yds05/master
yds05 Nov 16, 2018
6e44618
Replace hard-coded tfjobs with kubeflow job plural name
Nov 19, 2018
a8a97c6
Fix wrongly import location in kubeflowconfig.ts
Nov 19, 2018
83264fb
Fix typos and remove unnecessary ut config
Nov 19, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions src/nni_manager/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini
import {
RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService'

import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubeflow/kubeflowTrainingService';

function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
const createNew: boolean = (startExpMode === 'new');
Expand All @@ -52,6 +52,8 @@ async function initContainer(platformMode: string): Promise<void> {
Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
} else if (platformMode === 'pai') {
Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') {
Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
} else {
throw new Error(`Error: unsupported mode: ${mode}`);
}
Expand All @@ -76,19 +78,22 @@ if (!strPort || strPort.length === 0) {
const port: number = parseInt(strPort, 10);

const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai'].includes(mode)) {
if (!['local', 'remote', 'pai', 'kubeflow'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`);
usage();
process.exit(1);
}

const startMode: string = parseArg(['--start_mode', '-s']);
if (!['new', 'resume'].includes(startMode)) {
console.log(`FATAL: unknown start_mode: ${startMode}`);
usage();
process.exit(1);
}

const experimentId: string = parseArg(['--experiment_id', '-id']);
if (startMode === 'resume' && experimentId.trim().length < 1) {
console.log(`FATAL: cannot resume experiment, invalid experiment_id: ${experimentId}`);
usage();
process.exit(1);
}
Expand Down
1 change: 1 addition & 0 deletions src/nni_manager/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"express": "^4.16.3",
"express-joi-validator": "^2.0.0",
"node-nvidia-smi": "^1.0.0",
"node-yaml": "^3.1.1",
"rx": "^4.1.0",
"sqlite3": "^4.0.2",
"ssh2": "^0.6.1",
Expand Down
8 changes: 8 additions & 0 deletions src/nni_manager/rest_server/restValidationSchemas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ export namespace ValidationSchemas {
userName: joi.string().min(1).required(),
passWord: joi.string().min(1).required(),
host: joi.string().min(1).required()
}),
kubeflow_config: joi.object({
operator: joi.string().min(1).required(),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
}).required(),
kubernetesServer: joi.string().min(1).required()
})
}
};
Expand Down
96 changes: 96 additions & 0 deletions src/nni_manager/training_service/common/clusterJobRestServer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

'use strict';

import * as assert from 'assert';
import { Request, Response, Router } from 'express';
import * as bodyParser from 'body-parser';
import * as component from '../../common/component';
import { getBasePort, getExperimentId } from '../../common/experimentStartupInfo';
import { RestServer } from '../../common/restServer'

/**
 * Rest server base class for cluster based training services.
 * It registers an update-metrics endpoint and hands every received metrics payload
 * to the concrete subclass through handleTrialMetrics().
 */
@component.Singleton
export abstract class ClusterJobRestServer extends RestServer {
    // NOTE(review): the root path still says 'nni-pai' although this class is not PAI specific.
    // It is left untouched because whoever posts metrics presumably depends on it - confirm before renaming.
    private readonly API_ROOT_URL: string = '/api/v1/nni-pai';

    /** Experiment id, read once at construction and embedded in the update-metrics route */
    private readonly expId: string = getExperimentId();

    /**
     * Sets this server's port to the value returned by getBasePort() plus one.
     * The assertion rejects a missing base port and any base port that is not above 1024.
     */
    constructor() {
        super();
        const basePort: number = getBasePort();
        assert(basePort && basePort > 1024);

        this.port = basePort + 1;
    }

    /**
     * Port assigned to this rest server
     * @throws Error if the port is not set
     */
    public get clusterRestServerPort(): number {
        if (!this.port) {
            // This class is shared by several platforms, so the message must not name a specific one
            throw new Error('Cluster job Rest server port is undefined');
        }

        return this.port;
    }

    /**
     * Installs the JSON body parser and mounts the metrics router under API_ROOT_URL
     */
    protected registerRestHandler(): void {
        this.app.use(bodyParser.json());
        this.app.use(this.API_ROOT_URL, this.createRestHandler());
    }

    /**
     * Builds the router that serves POST /update-metrics/<experiment id>/:trialId
     * @returns router with a logging middleware and the update-metrics handler attached
     */
    private createRestHandler(): Router {
        const router: Router = Router();

        // Log every incoming request and mark every response as JSON
        // tslint:disable-next-line:typedef
        router.use((req: Request, res: Response, next) => {
            this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
            res.setHeader('Content-Type', 'application/json');
            next();
        });

        router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
            try {
                this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`);
                this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`);

                // The job id and the metrics are taken from the request body, not from the URL
                this.handleTrialMetrics(req.body.jobId, req.body.metrics);

                res.send();
            } catch (err) {
                // Anything can be thrown, so do not assume an Error instance when building the response body
                const reason: string = err instanceof Error ? err.message : String(err);
                this.log.error(`json parse metrics error: ${err}`);
                res.status(500);
                res.send(reason);
            }
        });

        return router;
    }

    /**
     * Abstract method to handle trial metrics data
     * @param jobId value of the jobId field of the request body
     * @param trialMetrics value of the metrics field of the request body
     */
    protected abstract handleTrialMetrics(jobId: string, trialMetrics: any[]): void;
}
30 changes: 30 additions & 0 deletions src/nni_manager/training_service/common/containerJobData.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

'use strict';

/**
 * Shell script that installs the nni python package unless `import nni` already succeeds.
 *
 * The script deliberately has no top-level `return`: the result of `return` outside of a
 * function or a sourced script is unspecified by POSIX (bash prints an error and yields a
 * non-zero status). The negated check below behaves the same whether the script is
 * executed or sourced.
 */
export const CONTAINER_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash
if ! python3 -c 'import nni' > /dev/null 2>&1; then
  # nni module is not installed yet, install it for the current user
  python3 -m pip install --user --upgrade nni
fi`;
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,6 @@ export enum TrialConfigMetadataKey {
EXPERIMENT_ID = 'experimentId',
MULTI_PHASE = 'multiPhase',
RANDOM_SCHEDULER = 'random_scheduler',
PAI_CLUSTER_CONFIG = 'pai_config'
PAI_CLUSTER_CONFIG = 'pai_config',
KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config'
}
93 changes: 93 additions & 0 deletions src/nni_manager/training_service/kubeflow/kubeflowConfig.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import { TrialConfig } from "../common/trialConfig";

/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

'use strict';


/** Operator names that can be chosen for a Kubeflow experiment */
export type KubeflowOperator =
    | 'tf-operator'
    | 'pytorch-operator'
    | 'mxnet-operator'
    | 'caffe2-operator'
    | 'chainer-operator'
    | 'mpi-operator';

/** Plural job names used in K8S, one per Kubeflow operator */
export type KubeflowOperatorPlural =
    | 'tfjobs'
    | 'pytorchjobs'
    | 'mxjobs'
    | 'caffe2jobs'
    | 'chainerjobs'
    | 'mpijobs';

/**
 * Lookup table from a Kubeflow operator name to its plural name in K8S
 */
export const kubeflowOperatorMap: Map<KubeflowOperator, KubeflowOperatorPlural> =
    new Map<KubeflowOperator, KubeflowOperatorPlural>()
        .set('tf-operator', 'tfjobs')
        .set('pytorch-operator', 'pytorchjobs')
        .set('mxnet-operator', 'mxjobs')
        .set('caffe2-operator', 'caffe2jobs')
        .set('chainer-operator', 'chainerjobs')
        .set('mpi-operator', 'mpijobs');

/**
 * Kubeflow cluster configuration
 */
export class KubeflowClusterConfig {
    /**
     * Stores the three settings unchanged as read-only properties.
     * @param operator Name of Kubeflow operator, like tf-operator
     * @param nfs NFS configuration (server and exported path)
     * @param kubernetesServer kubernetesServer setting.
     *        NOTE(review): presumably the address of the Kubernetes server - confirm against the caller
     */
    constructor(
        public readonly operator: KubeflowOperator,
        public readonly nfs: NFSConfig,
        public readonly kubernetesServer: string
    ) {}
}

/**
 * NFS configuration to store Kubeflow job related files
 */
export class NFSConfig {
    /**
     * Stores both settings unchanged as read-only properties.
     * @param server IP address of NFS server
     * @param path exported NFS path on NFS server
     */
    constructor(
        public readonly server: string,
        public readonly path: string
    ) {}
}

/**
 * Trial job configuration for Kubeflow.
 * Adds cpuNum, memoryMB and image on top of what TrialConfig receives.
 */
export class KubeflowTrialConfig extends TrialConfig {
    /**
     * @param command forwarded to the TrialConfig constructor
     * @param codeDir forwarded to the TrialConfig constructor
     * @param gpuNum forwarded to the TrialConfig constructor
     * @param cpuNum stored as-is; NOTE(review): presumably the number of CPUs requested for the trial - confirm
     * @param memoryMB stored as-is; NOTE(review): presumably the requested memory in megabytes - confirm
     * @param image stored as-is; NOTE(review): presumably the container image the trial runs in - confirm
     */
    constructor(
        command: string,
        codeDir: string,
        gpuNum: number,
        public readonly cpuNum: number,
        public readonly memoryMB: number,
        public readonly image: string
    ) {
        super(command, codeDir, gpuNum);
    }
}
78 changes: 78 additions & 0 deletions src/nni_manager/training_service/kubeflow/kubeflowData.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

'use strict';

import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';

/**
 * KubeflowTrialJobDetail: the TrialJobDetail implementation used for Kubeflow trial jobs.
 * Besides the TrialJobDetail fields it carries the Kubeflow job name, the trial sequence id,
 * a counter named queryJobFailedCount and the K8S plural name.
 */
// tslint:disable-next-line:max-classes-per-file
export class KubeflowTrialJobDetail implements TrialJobDetail {
    // Fields populated from constructor arguments unless noted otherwise
    public id: string;
    public status: TrialJobStatus;
    public submitTime: number;
    /** Not set by the constructor */
    public startTime?: number;
    /** Not set by the constructor */
    public endTime?: number;
    /** Starts out as an empty array */
    public tags?: string[];
    public url?: string;
    public workingDirectory: string;
    public form: JobApplicationForm;
    public kubeflowJobName: string;
    public sequenceId: number;
    /** Starts out at 0 */
    public queryJobFailedCount: number;
    public k8sPluralName: string;

    /**
     * Copies every argument into the field of the same name, then initializes
     * tags to an empty array and queryJobFailedCount to 0.
     */
    constructor(
        id: string,
        status: TrialJobStatus,
        submitTime: number,
        workingDirectory: string,
        form: JobApplicationForm,
        kubeflowJobName: string,
        sequenceId: number,
        url: string,
        k8sPluralName: string
    ) {
        this.id = id;
        this.status = status;
        this.submitTime = submitTime;
        this.workingDirectory = workingDirectory;
        this.form = form;
        this.kubeflowJobName = kubeflowJobName;
        this.sequenceId = sequenceId;
        this.tags = [];
        this.queryJobFailedCount = 0;
        this.url = url;
        this.k8sPluralName = k8sPluralName;
    }
}

/**
 * Template of the shell script that runs one Kubeflow trial.
 *
 * Positional placeholders, by where the template uses them:
 *   {0} NNI_SYS_DIR, {1} NNI_OUTPUT_DIR, {2} NNI_TRIAL_JOB_ID, {3} NNI_EXP_ID,
 *   {4} NNI_CODE_DIR, {5} NNI_TRIAL_SEQ_ID, {6} trial command,
 *   {7} nnimanager ip, {8} nnimanager port.
 * NOTE(review): the placeholders are presumably filled by a positional string formatter in the caller - confirm.
 *
 * The script creates the two directories, copies the code directory into the sys directory,
 * runs install_nni.sh from there and then starts the trial keeper, redirecting its
 * stdout/stderr to files in the output directory.
 *
 * Variable expansions and redirect targets are double-quoted so that a directory value
 * containing whitespace is not split into several words.
 * NOTE(review): the export lines are left unquoted on purpose; quoting them could change how
 * caller-supplied values are interpreted - confirm before touching them.
 */
export const KUBEFLOW_RUN_SHELL_FORMAT: string =
`#!/bin/bash
export NNI_PLATFORM=kubeflow
export NNI_SYS_DIR={0}
export NNI_OUTPUT_DIR={1}
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={2}
export NNI_EXP_ID={3}
export NNI_CODE_DIR={4}
export NNI_TRIAL_SEQ_ID={5}
mkdir -p "$NNI_SYS_DIR"
mkdir -p "$NNI_OUTPUT_DIR"
cp -rT "$NNI_CODE_DIR" "$NNI_SYS_DIR"
cd "$NNI_SYS_DIR"
sh install_nni.sh # Check and install NNI pkg
python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' 1>"$NNI_OUTPUT_DIR/trialkeeper_stdout" 2>"$NNI_OUTPUT_DIR/trialkeeper_stderr"
`;

/**
 * String literals accepted as a KubeflowTFJobType value.
 * NOTE(review): the values look like job state names - confirm against the code that reads them.
 */
export type KubeflowTFJobType =
    | 'Created'
    | 'Running'
    | 'Failed'
    | 'Succeeded';
Loading