From 64a59c4cb4a81de2604872e509740e04234d88f0 Mon Sep 17 00:00:00 2001 From: "weidan.kong" Date: Sat, 31 Jul 2021 04:51:14 +0800 Subject: [PATCH 1/5] dlc: init dlc & sumit dlc & start trial_runner --- examples/trials/mnist-pytorch/config_dlc.yml | 20 ++ nni/experiment/config/__init__.py | 1 + nni/experiment/config/dlc.py | 24 +++ nni/tools/nnictl/config_schema.py | 22 +- nni/tools/nnictl/config_utils.py | 6 + nni/tools/nnictl/ts_management.py | 1 + ts/nni_manager/common/experimentConfig.ts | 16 ++ ts/nni_manager/config/dlc/dlcUtil.py | 81 ++++++++ ts/nni_manager/main.ts | 2 +- .../reusable/dlc/dlcClient.ts | 188 ++++++++++++++++++ .../reusable/dlc/dlcConfig.ts | 40 ++++ .../environments/dlcEnvironmentService.ts | 135 +++++++++++++ .../environments/environmentServiceFactory.ts | 3 + 13 files changed, 537 insertions(+), 2 deletions(-) create mode 100644 examples/trials/mnist-pytorch/config_dlc.yml create mode 100644 nni/experiment/config/dlc.py create mode 100644 ts/nni_manager/config/dlc/dlcUtil.py create mode 100644 ts/nni_manager/training_service/reusable/dlc/dlcClient.ts create mode 100644 ts/nni_manager/training_service/reusable/dlc/dlcConfig.ts create mode 100644 ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts diff --git a/examples/trials/mnist-pytorch/config_dlc.yml b/examples/trials/mnist-pytorch/config_dlc.yml new file mode 100644 index 0000000000..d002d20995 --- /dev/null +++ b/examples/trials/mnist-pytorch/config_dlc.yml @@ -0,0 +1,20 @@ +searchSpaceFile: search_space.json +trialCommand: python3 mnist.py +trialConcurrency: 1 +maxTrialNumber: 10 +tuner: + name: TPE + classArgs: + optimize_mode: maximize +# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x +trainingService: + platform: dlc + type: Worker + image: registry-vpc.cn-hangzhou.aliyuncs.com/pai-dlc/pytorch-training:1.6.0-gpu-py37-cu101-ubuntu18.04 + podCount: 1 + ecsSpec: ecs.c6.large + region: cn-hangzhou + 
nasDataSourceId: ${your_nas_data_source_id} + accessKeyId: ${your_ak_id} + accessKeySecret: ${your_ak_key} + nasDataSourceId: ${your_nas_data_source_id} #e.g., datat56by9n1xt0a diff --git a/nni/experiment/config/__init__.py b/nni/experiment/config/__init__.py index cc7feefdbd..df4db48058 100644 --- a/nni/experiment/config/__init__.py +++ b/nni/experiment/config/__init__.py @@ -9,4 +9,5 @@ from .kubeflow import * from .frameworkcontroller import * from .adl import * +from .dlc import * from .shared_storage import * diff --git a/nni/experiment/config/dlc.py b/nni/experiment/config/dlc.py new file mode 100644 index 0000000000..7b59111fb1 --- /dev/null +++ b/nni/experiment/config/dlc.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from dataclasses import dataclass + +from .common import TrainingServiceConfig + +__all__ = ['DlcConfig'] + +@dataclass(init=False) +class DlcConfig(TrainingServiceConfig): + platform: str = 'dlc' + type: str = 'Worker' + image: str # 'registry-vpc.{region}.aliyuncs.com/pai-dlc/tensorflow-training:1.15.0-cpu-py36-ubuntu18.04', + pod_count: int + ecs_spec: str # e.g.,'ecs.c6.large' + region: str + nas_data_source_id: str + access_key_id: str + access_key_secret: str + + _validation_rules = { + 'platform': lambda value: (value == 'dlc', 'cannot be modified') + } diff --git a/nni/tools/nnictl/config_schema.py b/nni/tools/nnictl/config_schema.py index ae496f2600..0286b1b00a 100644 --- a/nni/tools/nnictl/config_schema.py +++ b/nni/tools/nnictl/config_schema.py @@ -130,7 +130,7 @@ def validate(self, data): Optional('maxTrialDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxTrialDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), 'trainingServicePlatform': setChoice( - 'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'), + 'trainingServicePlatform', 'remote', 
'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid', 'dlc'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), Optional('multiPhase'): setType('multiPhase', bool), Optional('multiThread'): setType('multiThread', bool), @@ -267,6 +267,25 @@ def validate(self, data): } } +dlc_trial_schema = { + 'trial': { + 'codeDir': setPathCheck('codeDir'), + 'command': setType('command', str), + 'image': setType('image', str), + } +} + +dlc_config_schema = { + Optional('dlcConfig'): { + 'type': setType('type', str), + 'image': setType('image', str), + 'podCount': setType('podCount', int), + 'ecsSpec': setType('ecsSpec', str), + 'region': setType('region', str), + 'nasDataSourceId': setType('nasDataSourceId', str), + } +} + hybrid_trial_schema = { 'trial': { 'codeDir': setPathCheck('codeDir'), @@ -477,6 +496,7 @@ def validate(self, data): 'dlts': Schema({**common_schema, **dlts_trial_schema, **dlts_config_schema}), 'hybrid': Schema({**common_schema, **hybrid_trial_schema, **hybrid_config_schema, **machine_list_schema, **pai_config_schema, **aml_config_schema, **remote_config_schema}), + 'dlc': Schema({**common_schema, **dlc_trial_schema, **dlc_config_schema}), } diff --git a/nni/tools/nnictl/config_utils.py b/nni/tools/nnictl/config_utils.py index 78f884e6f0..67ec91dbe1 100644 --- a/nni/tools/nnictl/config_utils.py +++ b/nni/tools/nnictl/config_utils.py @@ -68,6 +68,12 @@ def _inverse_cluster_metadata(platform: str, metadata_config: list) -> dict: inverse_config['amlConfig'] = kv['value'] elif kv['key'] == 'trial_config': inverse_config['trial'] = kv['value'] + elif platform == 'dlc': + for kv in metadata_config: + if kv['key'] == 'dlc_config': + inverse_config['dlcConfig'] = kv['value'] + elif kv['key'] == 'trial_config': + inverse_config['trial'] = kv['value'] elif platform == 'adl': for kv in metadata_config: if kv['key'] == 'adl_config': diff --git a/nni/tools/nnictl/ts_management.py 
b/nni/tools/nnictl/ts_management.py index 151b053591..8703c0e92f 100644 --- a/nni/tools/nnictl/ts_management.py +++ b/nni/tools/nnictl/ts_management.py @@ -9,6 +9,7 @@ 'remote', 'openpai', 'pai', 'aml', + 'dlc' 'kubeflow', 'frameworkcontroller', 'adl', diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index 2b69da0dfe..1ca7a5c7ac 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -73,6 +73,22 @@ export interface AmlConfig extends TrainingServiceConfig { maxTrialNumberPerGpu: number; } + +/* Alibaba PAI DLC */ +export interface DlcConfig extends TrainingServiceConfig { + platfrom: 'dlc'; + type: string; + image: string; + podCount: number; + ecsSpec: string; + region: string; + nasDataSourceId: string; + accessKeyId: string; + accessKeySecret: string; +} +/* Kubeflow */ + +// FIXME: merge with shared storage config export interface KubeflowStorageConfig { storageType: string; maxTrialNumberPerGpu?: number; diff --git a/ts/nni_manager/config/dlc/dlcUtil.py b/ts/nni_manager/config/dlc/dlcUtil.py new file mode 100644 index 0000000000..8864954c76 --- /dev/null +++ b/ts/nni_manager/config/dlc/dlcUtil.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import os +import sys +import time +import json +from argparse import ArgumentParser +# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x +from alibabacloud_pai_dlc20201203.client import Client +from alibabacloud_tea_openapi.models import Config +from alibabacloud_pai_dlc20201203.models import * #CreateJobRequest, JobSpec + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument('--type', help='the type of pod') + parser.add_argument('--image', help='the docker image of job') + parser.add_argument('--pod_count', type=int, default=1, help='pod count') + parser.add_argument('--ecs_spec', help='ecs spec') + parser.add_argument('--region', help='region') + parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration') + parser.add_argument('--access_key_id', help='access_key_id') + parser.add_argument('--access_key_secret', help='access_key_secret') + parser.add_argument('--experiment_name', help='the experiment name') + parser.add_argument('--script_dir', help='script directory') + parser.add_argument('--user_command', help='user command') + args = parser.parse_args() + + # init client + client = Client( + Config( + access_key_id=args.access_key_id, + access_key_secret=args.access_key_secret, + region_id=args.region, + endpoint=f'pai-dlc.{args.region}.aliyuncs.com' + ) + ) + + nas_1 = DataSourceItem( + data_source_type = 'nas', + data_source_id=args.nas_data_source_id, + ) + + # job spec + spec = JobSpec( + type='Worker', + image=args.image, + pod_count=args.pod_count, + ecs_spec=args.ecs_spec, + ) + + job_type = 'TFJob' if args.image.find('tensorflow') >= 0 else 'PyTorchJob' + req = CreateJobRequest( + display_name=args.experiment_name, + job_type=job_type, + job_specs=[spec], + data_sources=[nas_1], + user_command=args.user_command + ) + + # DLC submit + response = client.create_job(req) + job_id = response.body.job_id + print('job id: ' + job_id) + + while 
True: + line = sys.stdin.readline().rstrip() + if line == 'update_status': + print('status:' + client.get_job(job_id).body) + elif line == 'tracking_url': + #TODO: 1. get this url by api? 2. change this url in private dlc mode. + print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}') + elif line == 'stop': + client.stop_job() + exit(0) + elif line == 'receive': + print('receive:' + json.dumps(run.get_metrics())) + elif line: + items = line.split(':') + if items[0] == 'command': + run.log('nni_manager', line[8:]) diff --git a/ts/nni_manager/main.ts b/ts/nni_manager/main.ts index 360dc8148c..31587a7f61 100644 --- a/ts/nni_manager/main.ts +++ b/ts/nni_manager/main.ts @@ -63,7 +63,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN function usage(): void { console.info('usage: node main.js --port --mode \ - --start_mode --experiment_id --foreground '); + --start_mode --experiment_id --foreground '); } const strPort: string = parseArg(['--port', '-p']); diff --git a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts new file mode 100644 index 0000000000..f0e5b56115 --- /dev/null +++ b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts @@ -0,0 +1,188 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +'use strict'; + +import { Deferred } from 'ts-deferred'; +import { PythonShell } from 'python-shell'; +import { getLogger, Logger } from '../../../common/log'; + +export class DlcClient { + private log: Logger; + public type: string; + public image: string; + public podCount: number; + public ecsSpec: string; + public region: string; + // e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC + // create a NAS data and copy the 'DataSet ConfigurationID' + public nasDataSourceId: string; + public accessKeyId: string; + public accessKeySecret: string; + public experimentId: string; + public environmentId: string; + public userCommand: string; + public pythonShellClient: undefined | PythonShell; + public codeDir: string; + + constructor( + type: string, + image: string, + podCount: number, + experimentId: string, + environmentId: string, + ecsSpec: string, + region: string, + nasDataSourceId: string, + accessKeyId: string, + accessKeySecret: string, + userCommand: string, + codeDir: string, + ) { + this.log = getLogger('DlcClient'); + this.type = type; + this.image = image; + this.podCount = podCount; + this.ecsSpec = ecsSpec; + this.image = image; + this.region = region; + this.nasDataSourceId = nasDataSourceId; + this.accessKeyId = accessKeyId; + this.accessKeySecret = accessKeySecret + this.experimentId = experimentId; + this.environmentId = environmentId; + this.userCommand = userCommand; + this.codeDir = codeDir; + } + + private getScript(): string[] { + const script: string[] = []; + script.push( + `python ./config/dlc/dlcUtil.py --type ${this.type} --image ${this.image} --pod_count ${this.podCount} ` + + `--ecs_spec ${this.ecsSpec} --experiment_name nni_exp_${this.experimentId} ` + + `--region ${this.region} --nas_data_source_id ${this.nasDataSourceId} --access_key_id ${this.accessKeyId} ` + + `--access_key_secret ${this.accessKeySecret} --user_command "${this.userCommand}" --script_dir ${this.codeDir}` ); + return script; + } + + public 
submit(): Promise { + const deferred: Deferred = new Deferred(); + this.pythonShellClient = new PythonShell('dlcUtil.py', { + scriptPath: './config/dlc', + pythonPath: 'python', + pythonOptions: ['-u'], // get print results in real-time + args: [ + '--type', this.type, + '--image', this.image, + '--pod_count', String(this.podCount), + '--ecs_spec', this.ecsSpec, + '--region', this.region, + '--nas_data_source_id', this.nasDataSourceId, + '--access_key_id', this.accessKeyId, + '--access_key_secret', this.accessKeySecret, + '--experiment_name', `nni_exp_${this.experimentId}_env_${this.environmentId}`, + '--script_dir', this.codeDir, + '--user_command', this.userCommand, + ] + }); + this.log.debug(this.pythonShellClient.command); + this.pythonShellClient.on('message', function (envId: any) { + // received a message sent from the Python script (a simple "print" statement) + deferred.resolve(envId); + }); + this.monitorError(this.pythonShellClient, deferred); + return deferred.promise; + + return deferred.promise; + } + + public stop(): void { + if (this.pythonShellClient === undefined) { + throw Error('python shell client not initialized!'); + } + this.pythonShellClient.send('stop'); + } + + public getTrackingUrl(): Promise { + const deferred: Deferred = new Deferred(); + if (this.pythonShellClient === undefined) { + throw Error('python shell client not initialized!'); + } + this.pythonShellClient.send('tracking_url'); + this.pythonShellClient.on('message', (status: any) => { + const trackingUrl = this.parseContent('tracking_url', status); + if (trackingUrl !== '') { + deferred.resolve(trackingUrl); + } + }); + this.monitorError(this.pythonShellClient, deferred); + return deferred.promise; + } + + public updateStatus(oldStatus: string): Promise { + const deferred: Deferred = new Deferred(); + if (this.pythonShellClient === undefined) { + throw Error('python shell client not initialized!'); + } + this.pythonShellClient.send('update_status'); + var log = this.log; + 
this.pythonShellClient.on('message', (status: any) => { + log.debug(`updateStatus: message ${status}`); + let newStatus = this.parseContent('status', status); + if (newStatus === '') { + newStatus = oldStatus; + } + deferred.resolve(newStatus); + }); + this.monitorError(this.pythonShellClient, deferred); + return deferred.promise; + } + + public sendCommand(message: string): void { + if (this.pythonShellClient === undefined) { + throw Error('python shell client not initialized!'); + } + this.log.debug(`command:${message}`); + this.pythonShellClient.send(`command:${message}`); + } + + public receiveCommand(): Promise { + const deferred: Deferred = new Deferred(); + if (this.pythonShellClient === undefined) { + throw Error('python shell client not initialized!'); + } + this.pythonShellClient.send('receive'); + var log = this.log; + this.pythonShellClient.on('message', (command: any) => { + log.debug(`message*** ${command}`); + const message = this.parseContent('receive', command); + if (message !== '') { + deferred.resolve(JSON.parse(message)) + } + }); + this.monitorError(this.pythonShellClient, deferred); + return deferred.promise; + } + + // Monitor error information in dlc python shell client + private monitorError(pythonShellClient: PythonShell, deferred: Deferred): void { + var log = this.log; + pythonShellClient.on('error', function (error: any) { + log.debug(`error*** ${error}`); + deferred.reject(error); + }); + pythonShellClient.on('close', function (error: any) { + log.debug(`close*** ${error}`); + deferred.reject(error); + }); + } + + // Parse command content, command format is {head}:{content} + public parseContent(head: string, command: string): string { + const items = command.split(':'); + if (items[0] === head) { + return command.slice(head.length + 1); + } + return ''; + } +} diff --git a/ts/nni_manager/training_service/reusable/dlc/dlcConfig.ts b/ts/nni_manager/training_service/reusable/dlc/dlcConfig.ts new file mode 100644 index 
0000000000..0bdff76304 --- /dev/null +++ b/ts/nni_manager/training_service/reusable/dlc/dlcConfig.ts @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { TrialConfig } from '../../common/trialConfig'; +import { EnvironmentInformation } from '../environment'; +import { DlcClient } from '../dlc/dlcClient'; + +export class DlcClusterConfig { + public readonly type: string; + public readonly image: string; + public readonly podCount: number; + public readonly ecsSpec: string; + + constructor(type: string, image: string, podCount: number, ecsSpec: string) { + this.type = type; + this.image = image; + this.podCount = podCount; + this.ecsSpec = ecsSpec; + } +} + +export class DlcTrialConfig extends TrialConfig { + public readonly image: string; + public readonly command: string; + public readonly codeDir: string; + + constructor(codeDir: string, command: string, image: string) { + super("", codeDir, 0); + this.codeDir = codeDir; + this.command = command; + this.image = image; + } +} + +export class DlcEnvironmentInformation extends EnvironmentInformation { + public dlcClient?: DlcClient; + public currentMessageIndex: number = -1; +} diff --git a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts new file mode 100644 index 0000000000..c7a37a75a8 --- /dev/null +++ b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts @@ -0,0 +1,135 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +'use strict'; + +import * as fs from 'fs'; +import * as path from 'path'; +import * as component from '../../../common/component'; +import { getLogger, Logger } from '../../../common/log'; +import { ExperimentConfig, DlcConfig, flattenConfig } from '../../../common/experimentConfig'; +import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo'; +import { validateCodeDir } from '../../common/util'; +import { DlcClient } from '../dlc/dlcClient'; +import { DlcEnvironmentInformation } from '../dlc/dlcConfig'; +import { EnvironmentInformation, EnvironmentService } from '../environment'; +import { EventEmitter } from "events"; +import { FileCommandChannel } from '../channels/fileCommandChannel'; + +interface FlattenDlcConfig extends ExperimentConfig, DlcConfig { } + +/** + * Collector DLC jobs info from DLC cluster, and update dlc job status locally + */ +@component.Singleton +export class DlcEnvironmentService extends EnvironmentService { + + private readonly log: Logger = getLogger('dlcEnvironmentService'); + private experimentId: string; + private experimentRootDir: string; + private config: FlattenDlcConfig; + + constructor(config: ExperimentConfig, info: ExperimentStartupInfo) { + super(); + this.experimentId = info.experimentId; + this.experimentRootDir = info.logDir; + this.config = flattenConfig(config, 'dlc'); + validateCodeDir(this.config.trialCodeDirectory); + } + + public get hasStorageService(): boolean { + return false; + } + + public initCommandChannel(eventEmitter: EventEmitter): void { + this.commandChannel = new FileCommandChannel(eventEmitter); + } + + public createEnvironmentInformation(envId: string, envName: string): EnvironmentInformation { + return new DlcEnvironmentInformation(envId, envName); + } + + public get getName(): string { + return 'dlc'; + } + + public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { + environments.forEach(async (environment) => { + const dlcClient = (environment as 
DlcEnvironmentInformation).dlcClient; + if (!dlcClient) { + return Promise.reject('DLC client not initialized!'); + } + const newStatus = await dlcClient.updateStatus(environment.status); + switch (newStatus.toUpperCase()) { + case 'WAITING': + case 'QUEUED': + environment.setStatus('WAITING'); + break; + case 'RUNNING': + environment.setStatus('RUNNING'); + break; + case 'COMPLETED': + case 'SUCCEEDED': + environment.setStatus('SUCCEEDED'); + break; + case 'FAILED': + environment.setStatus('FAILED'); + return Promise.reject(`DLC: job ${environment.envId} is failed!`); + case 'STOPPED': + case 'STOPPING': + environment.setStatus('USER_CANCELED'); + break; + default: + environment.setStatus('UNKNOWN'); + } + }); + } + + public async startEnvironment(environment: EnvironmentInformation): Promise { + const dlcEnvironment: DlcEnvironmentInformation = environment as DlcEnvironmentInformation; + const environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp"); + if (!fs.existsSync(environmentLocalTempFolder)) { + await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true}); + } + + let script: string = environment.command; + + let dlcFolder: string = this.experimentRootDir.replace('/home/admin/workspace', '/root/data'); + const prepare = `cd ${dlcFolder} && mkdir -p envs/${environment.id} && cd envs/${environment.id} \ + && cp -r ../../environment-temp/envs/* .` + const startrun = `sh install_nni.sh && python -m nni.tools.trial_tool.trial_runner`; + + script = `${prepare} && ${startrun}`; + script = `${script} --job_pid_file ${environment.runnerWorkingFolder}/pid \ + 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr`; + + const dlcClient = new DlcClient( + this.config.type, + this.config.image, + this.config.podCount, + this.experimentId, + environment.id, + this.config.ecsSpec, + this.config.region, + this.config.nasDataSourceId, + this.config.accessKeyId, + 
this.config.accessKeySecret, + script, + dlcFolder, + ); + dlcEnvironment.id = await dlcClient.submit(); + this.log.debug('dlc: before getTrackingUrl'); + dlcEnvironment.trackingUrl = await dlcClient.getTrackingUrl(); + this.log.debug(`dlc trackingUrl: ${dlcEnvironment.trackingUrl}`); + dlcEnvironment.dlcClient = dlcClient; + } + + public async stopEnvironment(environment: EnvironmentInformation): Promise { + const dlcEnvironment: DlcEnvironmentInformation = environment as DlcEnvironmentInformation; + const dlcClient = dlcEnvironment.dlcClient; + if (!dlcClient) { + throw new Error('DLC client not initialized!'); + } + dlcClient.stop(); + } +} diff --git a/ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts b/ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts index aa1bcc5e6d..01275dfc61 100644 --- a/ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts +++ b/ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts @@ -8,6 +8,7 @@ import { ExperimentConfig } from '../../../common/experimentConfig'; import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo'; import { getCustomEnvironmentServiceConfig } from '../../../common/nniConfig'; import { importModule } from '../../../common/utils'; +import { DlcEnvironmentService } from './dlcEnvironmentService'; export async function createEnvironmentService(name: string, config: ExperimentConfig): Promise { const info = ExperimentStartupInfo.getInstance(); @@ -23,6 +24,8 @@ export async function createEnvironmentService(name: string, config: ExperimentC return new OpenPaiEnvironmentService(config, info); case 'kubeflow': return new KubeflowEnvironmentService(config, info); + case 'dlc': + return new DlcEnvironmentService(config, info); } const esConfig = await getCustomEnvironmentServiceConfig(name); From 5818644ebd4e9b65a66558fd9f50cd2cdcaf5953 Mon Sep 17 00:00:00 2001 From: "weidan.kong" 
Date: Thu, 5 Aug 2021 07:12:13 +0800 Subject: [PATCH 2/5] DLC: support file command channel used in dlcEnvironment --- examples/trials/mnist-pytorch/config_dlc.yml | 9 ++++++-- nni/experiment/config/dlc.py | 2 ++ ts/nni_manager/common/experimentConfig.ts | 2 ++ ts/nni_manager/config/dlc/dlcUtil.py | 2 +- .../environments/dlcEnvironmentService.ts | 22 +++++++++++++++---- 5 files changed, 30 insertions(+), 7 deletions(-) diff --git a/examples/trials/mnist-pytorch/config_dlc.yml b/examples/trials/mnist-pytorch/config_dlc.yml index d002d20995..72c60298f3 100644 --- a/examples/trials/mnist-pytorch/config_dlc.yml +++ b/examples/trials/mnist-pytorch/config_dlc.yml @@ -1,5 +1,8 @@ +# working directory on DSW, please provie FULL path +experimentWorkingDirectory: /home/admin/workspace/{your_working_dir} searchSpaceFile: search_space.json -trialCommand: python3 mnist.py +# the command on trial runner(or, DLC container), be aware of data_dir +trialCommand: python mnist.py --data_dir /root/data/{your_data_dir} trialConcurrency: 1 maxTrialNumber: 10 tuner: @@ -10,7 +13,7 @@ tuner: trainingService: platform: dlc type: Worker - image: registry-vpc.cn-hangzhou.aliyuncs.com/pai-dlc/pytorch-training:1.6.0-gpu-py37-cu101-ubuntu18.04 + image: registry-vpc.cn-beijing.aliyuncs.com/pai-dlc/pytorch-training:1.6.0-gpu-py37-cu101-ubuntu18.04 podCount: 1 ecsSpec: ecs.c6.large region: cn-hangzhou @@ -18,3 +21,5 @@ trainingService: accessKeyId: ${your_ak_id} accessKeySecret: ${your_ak_key} nasDataSourceId: ${your_nas_data_source_id} #e.g., datat56by9n1xt0a + localStorageMountPoint: /home/admin/workspace/ # DSW default root path, please do NOT change it + containerStorageMountPoint: /root/data/ # DLC container default root Path, change it based on your DLC NAS config. 
\ No newline at end of file diff --git a/nni/experiment/config/dlc.py b/nni/experiment/config/dlc.py index 7b59111fb1..d424142ff8 100644 --- a/nni/experiment/config/dlc.py +++ b/nni/experiment/config/dlc.py @@ -18,6 +18,8 @@ class DlcConfig(TrainingServiceConfig): nas_data_source_id: str access_key_id: str access_key_secret: str + localStorageMountPoint: str + containerStorageMountPoint: str _validation_rules = { 'platform': lambda value: (value == 'dlc', 'cannot be modified') diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index 1ca7a5c7ac..3c6bb00cbf 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -85,6 +85,8 @@ export interface DlcConfig extends TrainingServiceConfig { nasDataSourceId: string; accessKeyId: string; accessKeySecret: string; + localStorageMountPoint: string; + containerStorageMountPoint: string; } /* Kubeflow */ diff --git a/ts/nni_manager/config/dlc/dlcUtil.py b/ts/nni_manager/config/dlc/dlcUtil.py index 8864954c76..740e1bc726 100644 --- a/ts/nni_manager/config/dlc/dlcUtil.py +++ b/ts/nni_manager/config/dlc/dlcUtil.py @@ -66,7 +66,7 @@ while True: line = sys.stdin.readline().rstrip() if line == 'update_status': - print('status:' + client.get_job(job_id).body) + print('status:' + client.get_job(job_id).body.status) elif line == 'tracking_url': #TODO: 1. get this url by api? 2. change this url in private dlc mode. 
print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}') diff --git a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts index c7a37a75a8..5f3b0d1fd3 100644 --- a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts @@ -15,6 +15,9 @@ import { DlcEnvironmentInformation } from '../dlc/dlcConfig'; import { EnvironmentInformation, EnvironmentService } from '../environment'; import { EventEmitter } from "events"; import { FileCommandChannel } from '../channels/fileCommandChannel'; +import { MountedStorageService } from '../storages/mountedStorageService'; +import { Scope } from 'typescript-ioc'; +import { StorageService } from '../storageService'; interface FlattenDlcConfig extends ExperimentConfig, DlcConfig { } @@ -35,6 +38,7 @@ export class DlcEnvironmentService extends EnvironmentService { this.experimentRootDir = info.logDir; this.config = flattenConfig(config, 'dlc'); validateCodeDir(this.config.trialCodeDirectory); + component.Container.bind(StorageService).to(MountedStorageService).scope(Scope.Singleton); } public get hasStorageService(): boolean { @@ -61,6 +65,8 @@ export class DlcEnvironmentService extends EnvironmentService { } const newStatus = await dlcClient.updateStatus(environment.status); switch (newStatus.toUpperCase()) { + case 'CREATING': + case 'CREATED': case 'WAITING': case 'QUEUED': environment.setStatus('WAITING'); @@ -92,12 +98,19 @@ export class DlcEnvironmentService extends EnvironmentService { await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true}); } + let dlcFolder: string = this.experimentRootDir.replace( + this.config.localStorageMountPoint, this.config.containerStorageMountPoint); + dlcEnvironment.workingFolder = 
`${this.experimentRootDir}/envs/${environment.id}`; + dlcEnvironment.runnerWorkingFolder = `${dlcFolder}/envs/${environment.id}`; let script: string = environment.command; - let dlcFolder: string = this.experimentRootDir.replace('/home/admin/workspace', '/root/data'); - const prepare = `cd ${dlcFolder} && mkdir -p envs/${environment.id} && cd envs/${environment.id} \ - && cp -r ../../environment-temp/envs/* .` - const startrun = `sh install_nni.sh && python -m nni.tools.trial_tool.trial_runner`; + // environment id dir and command dir + if (!fs.existsSync(`${dlcEnvironment.workingFolder}/commands`)) { + await fs.promises.mkdir(`${dlcEnvironment.workingFolder}/commands`, {recursive: true}); + } + + const prepare = `cd ${dlcEnvironment.runnerWorkingFolder} && cp -r ../../environment-temp/envs/* ../`; + const startrun = `sh ../install_nni.sh && python -m nni.tools.trial_tool.trial_runner`; script = `${prepare} && ${startrun}`; script = `${script} --job_pid_file ${environment.runnerWorkingFolder}/pid \ @@ -117,6 +130,7 @@ export class DlcEnvironmentService extends EnvironmentService { script, dlcFolder, ); + dlcEnvironment.id = await dlcClient.submit(); this.log.debug('dlc: before getTrackingUrl'); dlcEnvironment.trackingUrl = await dlcClient.getTrackingUrl(); From 45eb3a5be9cc29673905df39f35c99e47f997656 Mon Sep 17 00:00:00 2001 From: "weidan.kong" Date: Wed, 11 Aug 2021 04:21:31 +0800 Subject: [PATCH 3/5] DLC: remove redundant script dir param --- examples/trials/mnist-pytorch/config_dlc.yml | 2 +- nni/experiment/config/dlc.py | 4 ++-- nni/tools/nnictl/config_schema.py | 2 ++ ts/nni_manager/config/dlc/dlcUtil.py | 1 - .../training_service/reusable/dlc/dlcClient.ts | 13 +------------ .../reusable/environments/dlcEnvironmentService.ts | 5 +---- 6 files changed, 7 insertions(+), 20 deletions(-) diff --git a/examples/trials/mnist-pytorch/config_dlc.yml b/examples/trials/mnist-pytorch/config_dlc.yml index 72c60298f3..abf5b575c9 100644 --- 
a/examples/trials/mnist-pytorch/config_dlc.yml +++ b/examples/trials/mnist-pytorch/config_dlc.yml @@ -3,7 +3,7 @@ experimentWorkingDirectory: /home/admin/workspace/{your_working_dir} searchSpaceFile: search_space.json # the command on trial runner(or, DLC container), be aware of data_dir trialCommand: python mnist.py --data_dir /root/data/{your_data_dir} -trialConcurrency: 1 +trialConcurrency: 1 # NOTE: please provide number <= 3 due to DLC system limit. maxTrialNumber: 10 tuner: name: TPE diff --git a/nni/experiment/config/dlc.py b/nni/experiment/config/dlc.py index d424142ff8..d8bf788518 100644 --- a/nni/experiment/config/dlc.py +++ b/nni/experiment/config/dlc.py @@ -18,8 +18,8 @@ class DlcConfig(TrainingServiceConfig): nas_data_source_id: str access_key_id: str access_key_secret: str - localStorageMountPoint: str - containerStorageMountPoint: str + local_storage_mount_point: str + container_storage_mount_point: str _validation_rules = { 'platform': lambda value: (value == 'dlc', 'cannot be modified') diff --git a/nni/tools/nnictl/config_schema.py b/nni/tools/nnictl/config_schema.py index 0286b1b00a..c82a26621d 100644 --- a/nni/tools/nnictl/config_schema.py +++ b/nni/tools/nnictl/config_schema.py @@ -283,6 +283,8 @@ def validate(self, data): 'ecsSpec': setType('ecsSpec', str), 'region': setType('region', str), 'nasDataSourceId': setType('nasDataSourceId', str), + 'localStorageMountPoint': setType('localStorageMountPoint', str), + 'containerStorageMountPoint': setType('containerStorageMountPoint', str), } } diff --git a/ts/nni_manager/config/dlc/dlcUtil.py b/ts/nni_manager/config/dlc/dlcUtil.py index 740e1bc726..4f84261ab2 100644 --- a/ts/nni_manager/config/dlc/dlcUtil.py +++ b/ts/nni_manager/config/dlc/dlcUtil.py @@ -22,7 +22,6 @@ parser.add_argument('--access_key_id', help='access_key_id') parser.add_argument('--access_key_secret', help='access_key_secret') parser.add_argument('--experiment_name', help='the experiment name') - parser.add_argument('--script_dir', 
help='script directory') parser.add_argument('--user_command', help='user command') args = parser.parse_args() diff --git a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts index f0e5b56115..e56b3e92ab 100644 --- a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts +++ b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts @@ -23,7 +23,6 @@ export class DlcClient { public environmentId: string; public userCommand: string; public pythonShellClient: undefined | PythonShell; - public codeDir: string; constructor( type: string, @@ -37,7 +36,6 @@ export class DlcClient { accessKeyId: string, accessKeySecret: string, userCommand: string, - codeDir: string, ) { this.log = getLogger('DlcClient'); this.type = type; @@ -52,7 +50,6 @@ export class DlcClient { this.experimentId = experimentId; this.environmentId = environmentId; this.userCommand = userCommand; - this.codeDir = codeDir; } private getScript(): string[] { @@ -61,7 +58,7 @@ export class DlcClient { `python ./config/dlc/dlcUtil.py --type ${this.type} --image ${this.image} --pod_count ${this.podCount} ` + `--ecs_spec ${this.ecsSpec} --experiment_name nni_exp_${this.experimentId} ` + `--region ${this.region} --nas_data_source_id ${this.nasDataSourceId} --access_key_id ${this.accessKeyId} ` + - `--access_key_secret ${this.accessKeySecret} --user_command "${this.userCommand}" --script_dir ${this.codeDir}` ); + `--access_key_secret ${this.accessKeySecret} --user_command "${this.userCommand}"` ); return script; } @@ -81,7 +78,6 @@ export class DlcClient { '--access_key_id', this.accessKeyId, '--access_key_secret', this.accessKeySecret, '--experiment_name', `nni_exp_${this.experimentId}_env_${this.environmentId}`, - '--script_dir', this.codeDir, '--user_command', this.userCommand, ] }); @@ -125,9 +121,7 @@ export class DlcClient { throw Error('python shell client not initialized!'); } this.pythonShellClient.send('update_status'); - var log = 
this.log; this.pythonShellClient.on('message', (status: any) => { - log.debug(`updateStatus: message ${status}`); let newStatus = this.parseContent('status', status); if (newStatus === '') { newStatus = oldStatus; @@ -152,9 +146,7 @@ export class DlcClient { throw Error('python shell client not initialized!'); } this.pythonShellClient.send('receive'); - var log = this.log; this.pythonShellClient.on('message', (command: any) => { - log.debug(`message*** ${command}`); const message = this.parseContent('receive', command); if (message !== '') { deferred.resolve(JSON.parse(message)) @@ -166,13 +158,10 @@ export class DlcClient { // Monitor error information in dlc python shell client private monitorError(pythonShellClient: PythonShell, deferred: Deferred): void { - var log = this.log; pythonShellClient.on('error', function (error: any) { - log.debug(`error*** ${error}`); deferred.reject(error); }); pythonShellClient.on('close', function (error: any) { - log.debug(`close*** ${error}`); deferred.reject(error); }); } diff --git a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts index 5f3b0d1fd3..a62cc17f86 100644 --- a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts @@ -9,7 +9,6 @@ import * as component from '../../../common/component'; import { getLogger, Logger } from '../../../common/log'; import { ExperimentConfig, DlcConfig, flattenConfig } from '../../../common/experimentConfig'; import { ExperimentStartupInfo } from '../../../common/experimentStartupInfo'; -import { validateCodeDir } from '../../common/util'; import { DlcClient } from '../dlc/dlcClient'; import { DlcEnvironmentInformation } from '../dlc/dlcConfig'; import { EnvironmentInformation, EnvironmentService } from '../environment'; @@ -37,7 +36,6 @@ export class DlcEnvironmentService 
extends EnvironmentService { this.experimentId = info.experimentId; this.experimentRootDir = info.logDir; this.config = flattenConfig(config, 'dlc'); - validateCodeDir(this.config.trialCodeDirectory); component.Container.bind(StorageService).to(MountedStorageService).scope(Scope.Singleton); } @@ -98,7 +96,7 @@ export class DlcEnvironmentService extends EnvironmentService { await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true}); } - let dlcFolder: string = this.experimentRootDir.replace( + const dlcFolder: string = this.experimentRootDir.replace( this.config.localStorageMountPoint, this.config.containerStorageMountPoint); dlcEnvironment.workingFolder = `${this.experimentRootDir}/envs/${environment.id}`; dlcEnvironment.runnerWorkingFolder = `${dlcFolder}/envs/${environment.id}`; @@ -128,7 +126,6 @@ export class DlcEnvironmentService extends EnvironmentService { this.config.accessKeyId, this.config.accessKeySecret, script, - dlcFolder, ); dlcEnvironment.id = await dlcClient.submit(); From b30c1a23707b0c118d3f431ed5d7aa2c82a7058d Mon Sep 17 00:00:00 2001 From: "weidan.kong" Date: Fri, 13 Aug 2021 07:57:48 +0800 Subject: [PATCH 4/5] DLC: add document && add job_type in config --- docs/en_US/TrainingService/DLCMode.rst | 83 +++++++++++++++++++ examples/trials/mnist-pytorch/config_dlc.yml | 7 +- nni/experiment/config/dlc.py | 1 + nni/tools/nnictl/config_schema.py | 1 + ts/nni_manager/common/experimentConfig.ts | 1 + ts/nni_manager/config/dlc/dlcUtil.py | 10 +-- .../reusable/dlc/dlcClient.ts | 10 ++- .../environments/dlcEnvironmentService.ts | 1 + 8 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 docs/en_US/TrainingService/DLCMode.rst diff --git a/docs/en_US/TrainingService/DLCMode.rst b/docs/en_US/TrainingService/DLCMode.rst new file mode 100644 index 0000000000..9e3a44c1f7 --- /dev/null +++ b/docs/en_US/TrainingService/DLCMode.rst @@ -0,0 +1,83 @@ +**Run an Experiment on Aliyun PAI-DSW + PAI-DLC** 
+=================================================== + +NNI supports running an experiment on `PAI-DSW `__ , submit trials to `PAI-DLC `__ called dlc mode. + +PAI-DSW server performs the role to submit a job while PAI-DLC is where the training job runs. + +Setup environment +----------------- + +Step 1. Install NNI, follow the install guide `here <../Tutorial/QuickStart.rst>`__. + +Step 2. Create PAI-DSW server following this `link `__. Note as the training service will be run on PAI-DLC, it won't cost many resources to run and you may just need a PAI-DSW server with CPU. + +Step 3. Open PAI-DLC `here `__, select the same region as your PAI-DSW server. Move to ``dataset configuration`` and mount the same NAS disk as the PAI-DSW server does. (Note currently only PAI-DLC public-cluster is supported.) + +Step 4. Open your PAI-DSW server command line, download and install PAI-DLC python SDK to submit DLC tasks, refer to `this link `__. + + +.. code-block:: bash + + wget https://sdk-portal-cluster-prod.oss-cn-zhangjiakou.aliyuncs.com/downloads/u-3536038a-3de7-4f2e-9379-0cb309d29355-python-pai-dlc.zip + unzip u-3536038a-3de7-4f2e-9379-0cb309d29355-python-pai-dlc.zip + pip install ./pai-dlc-20201203 # pai-dlc-20201203 refer to unzipped sdk file name, replace it accordingly. + + +Run an experiment +----------------- + +Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's content is like: + +.. code-block:: yaml + + # working directory on DSW, please provide FULL path + experimentWorkingDirectory: /home/admin/workspace/{your_working_dir} + searchSpaceFile: search_space.json + # the command on trial runner(or, DLC container), be aware of data_dir + trialCommand: python mnist.py --data_dir /root/data/{your_data_dir} + trialConcurrency: 1 # NOTE: please provide number <= 3 due to DLC system limit.
+ maxTrialNumber: 10 + tuner: + name: TPE + classArgs: + optimize_mode: maximize + # ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x + trainingService: + platform: dlc + type: Worker + image: registry-vpc.cn-beijing.aliyuncs.com/pai-dlc/pytorch-training:1.6.0-gpu-py37-cu101-ubuntu18.04 + jobType: PyTorchJob # choices: [TFJob, PyTorchJob] + podCount: 1 + ecsSpec: ecs.c6.large + region: cn-hangzhou + nasDataSourceId: ${your_nas_data_source_id} + accessKeyId: ${your_ak_id} + accessKeySecret: ${your_ak_key} + nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a + localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW + containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting + +Note: You should set ``platform: dlc`` in NNI config YAML file if you want to start experiment in dlc mode. + +Compared with `LocalMode `__, the training service configuration in dlc mode has these additional keys: ``type/image/jobType/podCount/ecsSpec/region/nasDataSourceId/accessKeyId/accessKeySecret``. For a detailed explanation, refer to this `link `__. + +Also, as dlc mode requires DSW/DLC to mount the same NAS disk to share information, there are two extra keys related to this: ``localStorageMountPoint`` and ``containerStorageMountPoint``. + +Run the following commands to start the example experiment: + +.. code-block:: bash + + git clone -b ${NNI_VERSION} https://github.com/microsoft/nni + cd nni/examples/trials/mnist-pytorch + + # modify config_dlc.yml ... + + nnictl create --config config_dlc.yml + +Replace ``${NNI_VERSION}`` with a released version name or branch name, e.g., ``v2.3``. + +Monitor your job +-------------------------------------------------- + +To monitor your job on DLC, you need to visit `DLC `__ to check job status.
diff --git a/examples/trials/mnist-pytorch/config_dlc.yml b/examples/trials/mnist-pytorch/config_dlc.yml index abf5b575c9..f12ad170fe 100644 --- a/examples/trials/mnist-pytorch/config_dlc.yml +++ b/examples/trials/mnist-pytorch/config_dlc.yml @@ -14,12 +14,13 @@ trainingService: platform: dlc type: Worker image: registry-vpc.cn-beijing.aliyuncs.com/pai-dlc/pytorch-training:1.6.0-gpu-py37-cu101-ubuntu18.04 + jobType: PyTorchJob # choices: [TFJob, PyTorchJob] podCount: 1 ecsSpec: ecs.c6.large region: cn-hangzhou nasDataSourceId: ${your_nas_data_source_id} accessKeyId: ${your_ak_id} accessKeySecret: ${your_ak_key} - nasDataSourceId: ${your_nas_data_source_id} #e.g., datat56by9n1xt0a - localStorageMountPoint: /home/admin/workspace/ # DSW default root path, please do NOT change it - containerStorageMountPoint: /root/data/ # DLC container default root Path, change it based on your DLC NAS config. \ No newline at end of file + nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a + localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW + containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according your setting diff --git a/nni/experiment/config/dlc.py b/nni/experiment/config/dlc.py index d8bf788518..2b70e789d9 100644 --- a/nni/experiment/config/dlc.py +++ b/nni/experiment/config/dlc.py @@ -12,6 +12,7 @@ class DlcConfig(TrainingServiceConfig): platform: str = 'dlc' type: str = 'Worker' image: str # 'registry-vpc.{region}.aliyuncs.com/pai-dlc/tensorflow-training:1.15.0-cpu-py36-ubuntu18.04', + job_type: str = 'TFJob' pod_count: int ecs_spec: str # e.g.,'ecs.c6.large' region: str diff --git a/nni/tools/nnictl/config_schema.py b/nni/tools/nnictl/config_schema.py index c82a26621d..15aad76e5a 100644 --- a/nni/tools/nnictl/config_schema.py +++ b/nni/tools/nnictl/config_schema.py @@ -279,6 +279,7 @@ def validate(self, data): Optional('dlcConfig'): { 'type': setType('type', str), 'image': 
setType('image', str), + 'jobType': setType('jobType', str), 'podCount': setType('podCount', int), 'ecsSpec': setType('ecsSpec', str), 'region': setType('region', str), diff --git a/ts/nni_manager/common/experimentConfig.ts b/ts/nni_manager/common/experimentConfig.ts index 3c6bb00cbf..d49c8c70e7 100644 --- a/ts/nni_manager/common/experimentConfig.ts +++ b/ts/nni_manager/common/experimentConfig.ts @@ -79,6 +79,7 @@ export interface DlcConfig extends TrainingServiceConfig { platfrom: 'dlc'; type: string; image: string; + jobType: string; podCount: number; ecsSpec: string; region: string; diff --git a/ts/nni_manager/config/dlc/dlcUtil.py b/ts/nni_manager/config/dlc/dlcUtil.py index 4f84261ab2..c046463bb1 100644 --- a/ts/nni_manager/config/dlc/dlcUtil.py +++ b/ts/nni_manager/config/dlc/dlcUtil.py @@ -13,8 +13,9 @@ if __name__ == "__main__": parser = ArgumentParser() - parser.add_argument('--type', help='the type of pod') + parser.add_argument('--type', help='the type of job spec') parser.add_argument('--image', help='the docker image of job') + parser.add_argument('--job_type', choices=['TFJob', 'PyTorchJob'], help='the job type') parser.add_argument('--pod_count', type=int, default=1, help='pod count') parser.add_argument('--ecs_spec', help='ecs spec') parser.add_argument('--region', help='region') @@ -42,16 +43,15 @@ # job spec spec = JobSpec( - type='Worker', + type=args.type, image=args.image, pod_count=args.pod_count, ecs_spec=args.ecs_spec, ) - job_type = 'TFJob' if args.image.find('tensorflow') >= 0 else 'PyTorchJob' req = CreateJobRequest( display_name=args.experiment_name, - job_type=job_type, + job_type=args.job_type, job_specs=[spec], data_sources=[nas_1], user_command=args.user_command @@ -70,7 +70,7 @@ #TODO: 1. get this url by api? 2. change this url in private dlc mode. 
print('tracking_url:' + f'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId={job_id}&regionId={args.region}') elif line == 'stop': - client.stop_job() + client.stop_job(job_id) exit(0) elif line == 'receive': print('receive:' + json.dumps(run.get_metrics())) diff --git a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts index e56b3e92ab..180dd40cd8 100644 --- a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts +++ b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts @@ -11,6 +11,7 @@ export class DlcClient { private log: Logger; public type: string; public image: string; + public jobType: string; public podCount: number; public ecsSpec: string; public region: string; @@ -27,6 +28,7 @@ export class DlcClient { constructor( type: string, image: string, + jobType: string, podCount: number, experimentId: string, environmentId: string, @@ -40,6 +42,7 @@ export class DlcClient { this.log = getLogger('DlcClient'); this.type = type; this.image = image; + this.jobType = jobType; this.podCount = podCount; this.ecsSpec = ecsSpec; this.image = image; @@ -55,8 +58,8 @@ export class DlcClient { private getScript(): string[] { const script: string[] = []; script.push( - `python ./config/dlc/dlcUtil.py --type ${this.type} --image ${this.image} --pod_count ${this.podCount} ` + - `--ecs_spec ${this.ecsSpec} --experiment_name nni_exp_${this.experimentId} ` + + `python ./config/dlc/dlcUtil.py --type ${this.type} --image ${this.image} --job_type ${this.jobType} ` + + `--pod_count ${this.podCount} --ecs_spec ${this.ecsSpec} --experiment_name nni_exp_${this.experimentId} ` + `--region ${this.region} --nas_data_source_id ${this.nasDataSourceId} --access_key_id ${this.accessKeyId} ` + `--access_key_secret ${this.accessKeySecret} --user_command "${this.userCommand}"` ); return script; @@ -71,6 +74,7 @@ export class DlcClient { args: [ '--type', this.type, '--image', this.image, + '--job_type', this.jobType,
'--pod_count', String(this.podCount), '--ecs_spec', this.ecsSpec, '--region', this.region, @@ -88,8 +92,6 @@ export class DlcClient { }); this.monitorError(this.pythonShellClient, deferred); return deferred.promise; - - return deferred.promise; } public stop(): void { diff --git a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts index a62cc17f86..08add0b68e 100644 --- a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts @@ -117,6 +117,7 @@ export class DlcEnvironmentService extends EnvironmentService { const dlcClient = new DlcClient( this.config.type, this.config.image, + this.config.jobType, this.config.podCount, this.experimentId, environment.id, From 891994ff7f7e8f7f35bc4ac3f0527992c63ab08f Mon Sep 17 00:00:00 2001 From: "weidan.kong" Date: Sat, 14 Aug 2021 06:57:48 +0800 Subject: [PATCH 5/5] DLC: use storage service & update doc --- docs/en_US/TrainingService/DLCMode.rst | 6 +- docs/en_US/TrainingService/Overview.rst | 6 +- docs/en_US/reference/experiment_config.rst | 106 ++++++++++++++++++ docs/en_US/training_services.rst | 1 + examples/trials/mnist-pytorch/config_dlc.yml | 3 +- nni/tools/nnictl/config_schema.py | 25 +---- ts/nni_manager/config/dlc/dlcUtil.py | 6 - .../reusable/dlc/dlcClient.ts | 16 +-- .../environments/dlcEnvironmentService.ts | 33 +++--- 9 files changed, 132 insertions(+), 70 deletions(-) diff --git a/docs/en_US/TrainingService/DLCMode.rst b/docs/en_US/TrainingService/DLCMode.rst index 9e3a44c1f7..9dc9e3443c 100644 --- a/docs/en_US/TrainingService/DLCMode.rst +++ b/docs/en_US/TrainingService/DLCMode.rst @@ -12,9 +12,9 @@ Step 1. Install NNI, follow the install guide `here <../Tutorial/QuickStart.rst> Step 2. Create PAI-DSW server following this `link `__. 
Note as the training service will be run on PAI-DLC, it won't cost many resources to run and you may just need a PAI-DSW server with CPU. -Step 3. Open PAI-DLC `here `__, select the same region as your PAI-DSW server. Move to ``dataset configuration`` and mount the same NAS disk as the PAI-DSW server does. (Note currently only PAI-DLC public-cluster is supported.) +Step 3. Open PAI-DLC `here `__, select the same region as your PAI-DSW server. Move to ``dataset configuration`` and mount the same NAS disk as the PAI-DSW server does. (Note currently only PAI-DLC public-cluster is supported.) -Step 4. Open your PAI-DSW server command line, download and install PAI-DLC python SDK to submit DLC tasks, refer to `this link `__. +Step 4. Open your PAI-DSW server command line, download and install PAI-DLC python SDK to submit DLC tasks, refer to `this link `__. Skip this step if SDK is already installed. .. code-block:: bash @@ -78,6 +78,6 @@ Run the following commands to start the example experiment: Replace ``${NNI_VERSION}`` with a released version name or branch name, e.g., ``v2.3``. Monitor your job --------------------------------------------------- +---------------- To monitor your job on DLC, you need to visit `DLC `__ to check job status. diff --git a/docs/en_US/TrainingService/Overview.rst b/docs/en_US/TrainingService/Overview.rst index 5f0727e8dd..6a4b5e91c1 100644 --- a/docs/en_US/TrainingService/Overview.rst +++ b/docs/en_US/TrainingService/Overview.rst @@ -6,7 +6,7 @@ What is Training Service? NNI training service is designed to allow users to focus on AutoML itself, agnostic to the underlying computing infrastructure where the trials are actually run. When migrating from one cluster to another (e.g., local machine to Kubeflow), users only need to tweak several configurations, and the experiment can be easily scaled. 
-Users can use training service provided by NNI, to run trial jobs on `local machine <./LocalMode.rst>`__\ , `remote machines <./RemoteMachineMode.rst>`__\ , and on clusters like `PAI <./PaiMode.rst>`__\ , `Kubeflow <./KubeflowMode.rst>`__\ , `AdaptDL <./AdaptDLMode.rst>`__\ , `FrameworkController <./FrameworkControllerMode.rst>`__\ , `DLTS <./DLTSMode.rst>`__ and `AML <./AMLMode.rst>`__. These are called *built-in training services*. +Users can use training service provided by NNI, to run trial jobs on `local machine <./LocalMode.rst>`__\ , `remote machines <./RemoteMachineMode.rst>`__\ , and on clusters like `PAI <./PaiMode.rst>`__\ , `Kubeflow <./KubeflowMode.rst>`__\ , `AdaptDL <./AdaptDLMode.rst>`__\ , `FrameworkController <./FrameworkControllerMode.rst>`__\ , `DLTS <./DLTSMode.rst>`__, `AML <./AMLMode.rst>`__ and `DLC <./DLCMode.rst>`__. These are called *built-in training services*. If the computing resource customers try to use is not listed above, NNI provides interface that allows users to build their own training service easily. Please refer to `how to implement training service <./HowToImplementTrainingService.rst>`__ for details. @@ -44,6 +44,8 @@ Built-in Training Services - NNI supports running experiment using `DLTS `__\ , which is an open source toolkit, developed by Microsoft, that allows AI scientists to spin up an AI cluster in turn-key fashion. * - `AML <./AMLMode.rst>`__ - NNI supports running an experiment on `AML `__ , called aml mode. + * - `DLC <./DLCMode.rst>`__ + - NNI supports running an experiment on `PAI-DLC `__ , called dlc mode. What does Training Service do? @@ -77,4 +79,4 @@ When reuse mode is enabled, a cluster, such as a remote machine or a computer in In the reuse mode, user needs to make sure each trial can run independently in the same job (e.g., avoid loading checkpoints from previous trials). -.. 
.. note:: Currently, only `Local <./LocalMode.rst>`__, `Remote <./RemoteMachineMode.rst>`__, `OpenPAI <./PaiMode.rst>`__ and `AML <./AMLMode.rst>`__ training services support resue mode. For Remote and OpenPAI training platforms, you can enable reuse mode according to `here <../reference/experiment_config.rst>`__ manually. AML is implemented under reuse mode, so the default mode is reuse mode, no need to manually enable. +.. note:: Currently, only `Local <./LocalMode.rst>`__, `Remote <./RemoteMachineMode.rst>`__, `OpenPAI <./PaiMode.rst>`__, `AML <./AMLMode.rst>`__ and `DLC <./DLCMode.rst>`__ training services support reuse mode. For Remote and OpenPAI training platforms, you can enable reuse mode according to `here <../reference/experiment_config.rst>`__ manually. AML is implemented under reuse mode, so the default mode is reuse mode, no need to manually enable. diff --git a/docs/en_US/reference/experiment_config.rst b/docs/en_US/reference/experiment_config.rst index fc6dfb79dd..472d5c4793 100644 --- a/docs/en_US/reference/experiment_config.rst +++ b/docs/en_US/reference/experiment_config.rst @@ -409,6 +409,7 @@ One of the following: - `RemoteConfig`_ - :ref:`OpenpaiConfig ` - `AmlConfig`_ +- `DlcConfig`_ - `HybridConfig`_ For `Kubeflow <../TrainingService/KubeflowMode.rst>`_, `FrameworkController <../TrainingService/FrameworkControllerMode.rst>`_, and `AdaptDL <../TrainingService/AdaptDLMode.rst>`_ training platforms, it is suggested to use `v1 config schema <../Tutorial/ExperimentConfig.rst>`_ for now. @@ -797,6 +798,111 @@ AML compute cluster name. type: ``str`` +DlcConfig +--------- + +Detailed usage can be found `here <../TrainingService/DLCMode.rst>`__. + + +platform +"""""""" + +Constant string ``"dlc"``. + + +type +"""" + +Job spec type. + +type: ``str`` + +default: ``"Worker"`` + + +image +""""" + +Name and tag of docker image to run the trials. + +type: ``str`` + + +jobType +""""""" + +PAI-DLC training job type, ``"TFJob"`` or ``"PyTorchJob"``.
+ +type: ``str`` + + +podCount +"""""""" + +Pod count to run a single training job. + +type: ``int`` + + +ecsSpec +""""""" + +Training server config spec string. + +type: ``str`` + + +region +"""""" + +The region where PAI-DLC public-cluster is located. + +type: ``str`` + + +nasDataSourceId +""""""""""""""" + +The NAS datasource id configured on the PAI-DLC side. + +type: ``str`` + + + +accessKeyId +""""""""""" + +The accessKeyId of your cloud account. + +type: ``str`` + + + +accessKeySecret +""""""""""""""" + +The accessKeySecret of your cloud account. + +type: ``str`` + + + +localStorageMountPoint +"""""""""""""""""""""" + +The mount point of the NAS on PAI-DSW server, default is /home/admin/workspace/. + +type: ``str`` + + +containerStorageMountPoint +"""""""""""""""""""""""""" + +The mount point of the NAS on PAI-DLC side, default is /root/data/. + +type: ``str`` + + HybridConfig ------------ diff --git a/docs/en_US/training_services.rst b/docs/en_US/training_services.rst index 4814b29835..ab8f79fbd7 100644 --- a/docs/en_US/training_services.rst +++ b/docs/en_US/training_services.rst @@ -11,4 +11,5 @@ Introduction to NNI Training Services FrameworkController<./TrainingService/FrameworkControllerMode> DLTS<./TrainingService/DLTSMode> AML<./TrainingService/AMLMode> + PAI-DLC<./TrainingService/DLCMode> Hybrid<./TrainingService/HybridMode> diff --git a/examples/trials/mnist-pytorch/config_dlc.yml b/examples/trials/mnist-pytorch/config_dlc.yml index f12ad170fe..d4372acad4 100644 --- a/examples/trials/mnist-pytorch/config_dlc.yml +++ b/examples/trials/mnist-pytorch/config_dlc.yml @@ -1,5 +1,4 @@ # working directory on DSW, please provie FULL path -experimentWorkingDirectory: /home/admin/workspace/{your_working_dir} searchSpaceFile: search_space.json # the command on trial runner(or, DLC container), be aware of data_dir trialCommand: python mnist.py --data_dir /root/data/{your_data_dir} @@ -22,5 +21,5 @@ trainingService: accessKeyId: ${your_ak_id} accessKeySecret:
${your_ak_key} nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a - localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW + localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path. containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according your setting diff --git a/nni/tools/nnictl/config_schema.py b/nni/tools/nnictl/config_schema.py index 15aad76e5a..ae496f2600 100644 --- a/nni/tools/nnictl/config_schema.py +++ b/nni/tools/nnictl/config_schema.py @@ -130,7 +130,7 @@ def validate(self, data): Optional('maxTrialDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxTrialDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), 'trainingServicePlatform': setChoice( - 'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid', 'dlc'), + 'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'dlts', 'aml', 'adl', 'hybrid'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), Optional('multiPhase'): setType('multiPhase', bool), Optional('multiThread'): setType('multiThread', bool), @@ -267,28 +267,6 @@ def validate(self, data): } } -dlc_trial_schema = { - 'trial': { - 'codeDir': setPathCheck('codeDir'), - 'command': setType('command', str), - 'image': setType('image', str), - } -} - -dlc_config_schema = { - Optional('dlcConfig'): { - 'type': setType('type', str), - 'image': setType('image', str), - 'jobType': setType('jobType', str), - 'podCount': setType('podCount', int), - 'ecsSpec': setType('ecsSpec', str), - 'region': setType('region', str), - 'nasDataSourceId': setType('nasDataSourceId', str), - 'localStorageMountPoint': setType('localStorageMountPoint', str), - 'containerStorageMountPoint': setType('containerStorageMountPoint', str), - 
} -} - hybrid_trial_schema = { 'trial': { 'codeDir': setPathCheck('codeDir'), @@ -499,7 +477,6 @@ def validate(self, data): 'dlts': Schema({**common_schema, **dlts_trial_schema, **dlts_config_schema}), 'hybrid': Schema({**common_schema, **hybrid_trial_schema, **hybrid_config_schema, **machine_list_schema, **pai_config_schema, **aml_config_schema, **remote_config_schema}), - 'dlc': Schema({**common_schema, **dlc_trial_schema, **dlc_config_schema}), } diff --git a/ts/nni_manager/config/dlc/dlcUtil.py b/ts/nni_manager/config/dlc/dlcUtil.py index c046463bb1..333fc5e078 100644 --- a/ts/nni_manager/config/dlc/dlcUtil.py +++ b/ts/nni_manager/config/dlc/dlcUtil.py @@ -72,9 +72,3 @@ elif line == 'stop': client.stop_job(job_id) exit(0) - elif line == 'receive': - print('receive:' + json.dumps(run.get_metrics())) - elif line: - items = line.split(':') - if items[0] == 'command': - run.log('nni_manager', line[8:]) diff --git a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts index 180dd40cd8..9ba3960dc4 100644 --- a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts +++ b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts @@ -55,21 +55,11 @@ export class DlcClient { this.userCommand = userCommand; } - private getScript(): string[] { - const script: string[] = []; - script.push( - `python ./config/dlc/dlcUtil.py --type ${this.type} --image ${this.image} --job_type ${this.jobType} ` + - `--pod_count ${this.podCount} --ecs_spec ${this.ecsSpec} --experiment_name nni_exp_${this.experimentId} ` + - `--region ${this.region} --nas_data_source_id ${this.nasDataSourceId} --access_key_id ${this.accessKeyId} ` + - `--access_key_secret ${this.accessKeySecret} --user_command "${this.userCommand}"` ); - return script; - } - public submit(): Promise { const deferred: Deferred = new Deferred(); this.pythonShellClient = new PythonShell('dlcUtil.py', { scriptPath: './config/dlc', - pythonPath: 'python', + 
pythonPath: 'python3', pythonOptions: ['-u'], // get print results in real-time args: [ '--type', this.type, @@ -142,8 +132,8 @@ export class DlcClient { this.pythonShellClient.send(`command:${message}`); } - public receiveCommand(): Promise { - const deferred: Deferred = new Deferred(); + public receiveCommand(): Promise { + const deferred: Deferred = new Deferred(); if (this.pythonShellClient === undefined) { throw Error('python shell client not initialized!'); } diff --git a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts index 08add0b68e..f7012be644 100644 --- a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts @@ -28,19 +28,21 @@ export class DlcEnvironmentService extends EnvironmentService { private readonly log: Logger = getLogger('dlcEnvironmentService'); private experimentId: string; - private experimentRootDir: string; private config: FlattenDlcConfig; constructor(config: ExperimentConfig, info: ExperimentStartupInfo) { super(); this.experimentId = info.experimentId; - this.experimentRootDir = info.logDir; this.config = flattenConfig(config, 'dlc'); component.Container.bind(StorageService).to(MountedStorageService).scope(Scope.Singleton); + const storageService = component.get(StorageService) + const remoteRoot = storageService.joinPath(this.config.localStorageMountPoint, 'nni-experiments', this.experimentId); + const localRoot = storageService.joinPath(this.config.localStorageMountPoint, 'nni-experiments'); + storageService.initialize(localRoot, remoteRoot); } public get hasStorageService(): boolean { - return false; + return true; } public initCommandChannel(eventEmitter: EventEmitter): void { @@ -91,28 +93,19 @@ export class DlcEnvironmentService extends EnvironmentService { public async startEnvironment(environment: 
EnvironmentInformation): Promise { const dlcEnvironment: DlcEnvironmentInformation = environment as DlcEnvironmentInformation; - const environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp"); - if (!fs.existsSync(environmentLocalTempFolder)) { - await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true}); - } - const dlcFolder: string = this.experimentRootDir.replace( - this.config.localStorageMountPoint, this.config.containerStorageMountPoint); - dlcEnvironment.workingFolder = `${this.experimentRootDir}/envs/${environment.id}`; - dlcEnvironment.runnerWorkingFolder = `${dlcFolder}/envs/${environment.id}`; - let script: string = environment.command; + const environmentRoot = path.join(this.config.containerStorageMountPoint, `/nni-experiments/${this.experimentId}`); + const localRoot = path.join(this.config.localStorageMountPoint, `/nni-experiments/${this.experimentId}`); + + dlcEnvironment.workingFolder = `${localRoot}/envs/${environment.id}`; + dlcEnvironment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`; - // environment id dir and command dir + // environment id dir and command dir, folder created on DLC side can't be accessed on DSW. 
if (!fs.existsSync(`${dlcEnvironment.workingFolder}/commands`)) { await fs.promises.mkdir(`${dlcEnvironment.workingFolder}/commands`, {recursive: true}); } - const prepare = `cd ${dlcEnvironment.runnerWorkingFolder} && cp -r ../../environment-temp/envs/* ../`; - const startrun = `sh ../install_nni.sh && python -m nni.tools.trial_tool.trial_runner`; - - script = `${prepare} && ${startrun}`; - script = `${script} --job_pid_file ${environment.runnerWorkingFolder}/pid \ - 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr`; + environment.command = `cd ${environmentRoot} && ${environment.command} 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr`; const dlcClient = new DlcClient( this.config.type, @@ -126,7 +119,7 @@ export class DlcEnvironmentService extends EnvironmentService { this.config.nasDataSourceId, this.config.accessKeyId, this.config.accessKeySecret, - script, + environment.command, ); dlcEnvironment.id = await dlcClient.submit();