-
Notifications
You must be signed in to change notification settings - Fork 1.8k
DLTS integration #1945
DLTS integration #1945
Changes from 21 commits
08d31c0
5426c11
c84bdc5
a60ca84
56cc475
7f97021
601f6a9
2cbda0a
38d7e89
c08da76
d5933d5
2f1beb2
b58ae28
bca20fd
01b553c
fc611ec
7251af0
3f1cbc4
1a8f05c
e4d8ab2
b0f4e41
cc5b28e
d19ecab
4a8b290
860e0af
fecc619
a340df2
c831f40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
**Run an Experiment on Deep Learning Training Service** | ||
=== | ||
NNI supports running experiments on [Deep Learning Training Service](https://github.com/microsoft/DLWorkspace.git) (DLTS); this is called dlts mode. Before using NNI dlts mode, you need an account that can access the DLTS dashboard. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed to DLTS |
||
|
||
## Setup Environment | ||
|
||
Step 1. Choose a cluster from the DLTS dashboard, and ask your administrator for the cluster dashboard URL. | ||
|
||
 | ||
|
||
Step 2. Prepare an NNI config YAML like the following: | ||
|
||
```yaml | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of directly post an yaml example, you might want to outline the new field like trainingServicePlatform: dlts, or additional keys comparing to LocalMode and RemoteMachineMode, refer to https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed all comments here other than DLTS specified ones. |
||
trainingServicePlatform: dlts | ||
authorName: your_name | ||
experimentName: auto_mnist | ||
# how many trials could be concurrently running | ||
trialConcurrency: 2 | ||
# maximum experiment running duration | ||
maxExecDuration: 3h | ||
# empty means never stop | ||
maxTrialNum: 100 | ||
# search space file | ||
searchSpacePath: search_space.json | ||
# choice: true, false | ||
useAnnotation: false | ||
tuner: | ||
builtinTunerName: TPE | ||
classArgs: | ||
optimize_mode: maximize | ||
trial: | ||
command: python3 mnist.py | ||
codeDir: . | ||
gpuNum: 1 | ||
image: msranni/nni | ||
dltsConfig: | ||
dashboard: # Ask administrator for the cluster dashboard URL | ||
``` | ||
|
||
Remember to fill in the cluster dashboard URL on the last line. | ||
|
||
Step 3. Open your working directory on the cluster, and paste the NNI config together with the related code into a directory. | ||
|
||
 | ||
|
||
Step 4. Submit an NNI manager job to the specified cluster. | ||
|
||
 | ||
|
||
Step 5. Go to the Endpoints tab of the newly created job, and click the Port 40000 link to check the trials' information. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
|
||
 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
debug: true | ||
authorName: default | ||
experimentName: example_mnist | ||
trialConcurrency: 1 | ||
maxExecDuration: 1h | ||
maxTrialNum: 10 | ||
#choice: local, remote, pai, dlts | ||
trainingServicePlatform: dlts | ||
searchSpacePath: search_space.json | ||
#choice: true, false | ||
useAnnotation: false | ||
tuner: | ||
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner | ||
#SMAC (SMAC should be installed through nnictl) | ||
builtinTunerName: TPE | ||
classArgs: | ||
#choice: maximize, minimize | ||
optimize_mode: maximize | ||
trial: | ||
command: python3 mnist.py | ||
codeDir: . | ||
gpuNum: 1 | ||
#The docker image to run nni job on dlts | ||
image: msranni/nni:latest | ||
dltsConfig: | ||
SparkSnail marked this conversation as resolved.
Show resolved
Hide resolved
|
||
dashboard: http://azure-eastus-p40-dev1-infra01.eastus.cloudapp.azure.com/ | ||
|
||
# The following fields are all optional and could be retrieved from environment | ||
# variables if running in DLTS job container. | ||
|
||
# cluster: .default | ||
# team: platform | ||
# email: example@microsoft.com | ||
# password: # Paste from DLTS dashboard |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
/**
 * Settings that identify a DLTS cluster and the account used to talk to it.
 *
 * In the example experiment config only `dashboard` is set explicitly; the
 * config comments state that the remaining fields may be retrieved from
 * environment variables when running inside a DLTS job container.
 * NOTE(review): the type still declares those fields as required, so it
 * presumably describes the config after that lookup has happened — confirm
 * against the code that populates it.
 */
export interface DLTSClusterConfig {
    /** URL of the DLTS cluster dashboard. */
    dashboard: string;

    /** Cluster name (the example config uses `.default`). */
    cluster: string;
    /** Team name; `DLTSJobConfig` copies it into both `team` and `vcName`. */
    team: string;

    /** Account e-mail; `DLTSJobConfig` uses it as the job's `userName`. */
    email: string;
    /** Account password (the example config says to paste it from the DLTS dashboard). */
    password: string;

    /**
     * GPU type of the cluster. Optional here, but `DLTSJobConfig` throws
     * 'GPU type not fetched' when it is still undefined at job-creation time.
     */
    gpuType?: string;
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
/**
 * Shell command template used to launch one trial on DLTS.
 *
 * Positional placeholders (presumably substituted by a String.Format-style
 * helper elsewhere — confirm against the caller):
 *   {0}  value exported as NNI_SYS_DIR (the command then `cd`s into it and runs install_nni.sh)
 *   {1}  value exported as NNI_OUTPUT_DIR
 *   {2}  value exported as NNI_TRIAL_JOB_ID
 *   {3}  value exported as NNI_EXP_ID
 *   {4}  value exported as NNI_TRIAL_SEQ_ID
 *   {5}  value exported as MULTI_PHASE
 *   {6}  directory to `cd` into before starting the trial keeper
 *   {7}  --trial_command
 *   {8}  --nnimanager_ip
 *   {9}  --nnimanager_port
 *   {10} --nni_manager_version
 *   {11} --log_collection
 *
 * Placeholders {6}-{11} are wrapped in single quotes inside the template.
 */
export const DLTS_TRIAL_COMMAND_FORMAT: string = [
    // Environment consumed by the trial process.
    'export NNI_PLATFORM=dlts NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5}',
    // Install NNI from the system directory.
    '&& cd $NNI_SYS_DIR && sh install_nni.sh',
    // Start the trial keeper, which wraps the user's trial command.
    "&& cd '{6}' && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}'",
    "--nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}'",
].join(' ');
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
import { DLTSClusterConfig } from "./dltsClusterConfig"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you please add license at file beginning and a empty line at end? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
/**
 * Description of a single DLTS job.
 *
 * Combines a set of fixed defaults (declared as field initializers below)
 * with values taken from the cluster configuration and the constructor
 * arguments.
 * NOTE(review): field names and the string value of `preemptionAllowed`
 * presumably mirror what the DLTS job-submission API expects — confirm
 * before renaming or retyping anything here.
 */
export class DLTSJobConfig {
    // Derived from the cluster configuration in the constructor.
    public readonly team: string;
    public readonly userName: string;
    public readonly vcName: string;
    public readonly gpuType: string;

    // Fixed defaults shared by every job.
    public readonly jobType = 'training';
    public readonly jobtrainingtype = 'RegularJob';
    public readonly ssh = false;
    public readonly ipython = false;
    public readonly tensorboard = false;
    public readonly workPath = '';
    public readonly enableworkpath = true;
    public readonly dataPath = '';
    public readonly enabledatapath = false;
    public readonly jobPath = '';
    public readonly enablejobpath = true;
    public readonly mountpoints = [];
    public readonly env = [];
    public readonly hostNetwork = false;
    public readonly useGPUTopology = false;
    public readonly isPrivileged = false;
    public readonly hostIPC = false;
    public readonly preemptionAllowed = 'False';

    /**
     * @param clusterConfig cluster settings; `team`, `email` and `gpuType` are read from it
     * @param jobName stored as `jobName`
     * @param resourcegpu stored as `resourcegpu`
     * @param image stored as `image`
     * @param cmd stored as `cmd`
     * @param interactivePorts stored as `interactivePorts`
     * @throws Error 'GPU type not fetched' when `clusterConfig.gpuType` is undefined
     */
    public constructor(
        clusterConfig: DLTSClusterConfig,
        public readonly jobName: string,
        public readonly resourcegpu: number,
        public readonly image: string,
        public readonly cmd: string,
        public readonly interactivePorts: number[]
    ) {
        const { team, email, gpuType } = clusterConfig;
        if (gpuType === undefined) {
            throw new Error('GPU type not fetched');
        }

        // `team` is assigned before `vcName` so property creation order is unchanged.
        this.team = team;
        this.vcName = team;
        this.gpuType = gpuType;
        this.userName = email;
    }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
'use strict'; | ||
|
||
import { Request, Response, Router } from 'express'; | ||
import { Inject } from 'typescript-ioc'; | ||
import * as component from '../../common/component'; | ||
import { ClusterJobRestServer } from '../common/clusterJobRestServer'; | ||
import { DLTSTrainingService } from './dltsTrainingService'; | ||
|
||
/**
 * One entry of the list stored by `POST /parameter-file-meta` and returned by
 * `GET /parameter-file-meta` on the DLTS job REST server.
 */
export interface ParameterFileMeta {
    /** Experiment identifier (presumably the NNI experiment the file belongs to). */
    readonly experimentId: string;
    /** Trial identifier (presumably the trial the parameter file was generated for). */
    readonly trialId: string;
    /** Path of the parameter file; NOTE(review): which filesystem this path refers to is not visible here. */
    readonly filePath: string;
}
|
||
/**
 * DLTS training service REST server. On top of the handlers provided by
 * ClusterJobRestServer it forwards trial metrics to the DLTS training service
 * and exposes `/parameter-file-meta` (POST to register an entry, GET to list
 * all registered entries).
 */
@component.Singleton
export class DLTSJobRestServer extends ClusterJobRestServer {
    /** Entries registered through `POST /parameter-file-meta`, in arrival order. */
    private parameterFileMetaList: ParameterFileMeta[] = [];

    @Inject
    private readonly dltsTrainingService: DLTSTrainingService;

    /**
     * constructor to provide NNIRestServer's own rest property, e.g. port
     */
    constructor() {
        super();
        this.dltsTrainingService = component.get(DLTSTrainingService);
    }

    /**
     * Emits every reported metric as its own 'metric' event on the training
     * service's MetricsEmitter.
     *
     * @param jobId id attached to each emitted event
     * @param metrics metrics reported for that job
     */
    // tslint:disable-next-line:no-any
    protected handleTrialMetrics(jobId: string, metrics: any[]): void {
        // Split metrics array into single metric, then emit
        // Warning: If not split metrics into single ones, the behavior will be UNKNOWN
        for (const singleMetric of metrics) {
            this.dltsTrainingService.MetricsEmitter.emit('metric', {
                id : jobId,
                data : singleMetric
            });
        }
    }

    /**
     * Extends the base router with the `/parameter-file-meta` endpoints.
     *
     * @returns the router created by the base class, with the extra routes attached
     */
    protected createRestHandler(): Router {
        const router: Router = super.createRestHandler();

        router.post(`/parameter-file-meta`, (req: Request, res: Response) => {
            try {
                this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`);
                // The body comes from the network: reject anything that is not a
                // well-formed ParameterFileMeta instead of storing it in the typed list.
                if (!DLTSJobRestServer.isParameterFileMeta(req.body)) {
                    this.log.error(`POST parameter-file-meta error: invalid body`);
                    res.status(400);
                    res.send('experimentId, trialId and filePath must be strings');

                    return;
                }
                this.parameterFileMetaList.push(req.body);
                res.send();
            } catch (err) {
                this.log.error(`POST parameter-file-meta error: ${err}`);
                res.status(500);
                res.send(DLTSJobRestServer.describeError(err));
            }
        });

        router.get(`/parameter-file-meta`, (_req: Request, res: Response) => {
            try {
                this.log.info(`GET /parameter-file-meta`);
                res.send(this.parameterFileMetaList);
            } catch (err) {
                this.log.error(`GET parameter-file-meta error: ${err}`);
                res.status(500);
                res.send(DLTSJobRestServer.describeError(err));
            }
        });

        return router;
    }

    /** Type guard: true when `body` is an object whose three ParameterFileMeta fields are strings. */
    private static isParameterFileMeta(body: unknown): body is ParameterFileMeta {
        if (typeof body !== 'object' || body === null) {
            return false;
        }
        const candidate = body as Record<string, unknown>;

        return typeof candidate.experimentId === 'string'
            && typeof candidate.trialId === 'string'
            && typeof candidate.filePath === 'string';
    }

    /** Text sent to the client for a caught value, which is not guaranteed to be an Error. */
    private static describeError(err: unknown): string {
        return err instanceof Error ? err.message : String(err);
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shall we use the official name (DLWorkspace) of the project here? or the well known aka. DLTS.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ping @hongzhili for suggestion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed to DLTS