From 916a91ee6d28d67b94e08dc85dda53328f87d6aa Mon Sep 17 00:00:00 2001 From: "weidan.kong" Date: Sat, 14 Aug 2021 06:57:48 +0800 Subject: [PATCH] DLC: use storage service & update doc --- docs/en_US/TrainingService/DLCMode.rst | 4 +- docs/en_US/TrainingService/Overview.rst | 6 +- docs/en_US/reference/experiment_config.rst | 108 ++++++++++++++++++ examples/trials/mnist-pytorch/config_dlc.yml | 3 +- ts/nni_manager/config/dlc/dlcUtil.py | 6 - .../reusable/dlc/dlcClient.ts | 4 +- .../environments/dlcEnvironmentService.ts | 34 +++--- 7 files changed, 131 insertions(+), 34 deletions(-) diff --git a/docs/en_US/TrainingService/DLCMode.rst b/docs/en_US/TrainingService/DLCMode.rst index 9e3a44c1f77..bb3affae27e 100644 --- a/docs/en_US/TrainingService/DLCMode.rst +++ b/docs/en_US/TrainingService/DLCMode.rst @@ -14,7 +14,7 @@ Step 2. Create PAI-DSW server following this `link `__, select the same region as your PAI-DSW server. Move to ``dataset configuration`` and mount the same NAS disk as the PAI-DSW server does. (Note currently only PAI-DLC public-cluster is supported.) -Step 4. Open your PAI-DSW server command line, download and install PAI-DLC python SDK to submit DLC tasks, refer to `this link `__. +Step 4. Open your PAI-DSW server command line, download and install PAI-DLC python SDK to submit DLC tasks, refer to `this link `__. Skip this step if SDK is already installed. .. code-block:: bash @@ -78,6 +78,6 @@ Run the following commands to start the example experiment: Replace ``${NNI_VERSION}`` with a released version name or branch name, e.g., ``v2.3``. Monitor your job --------------------------------------------------- +---------------- To monitor your job on DLC, you need to visit `DLC `__ to check job status. 
diff --git a/docs/en_US/TrainingService/Overview.rst b/docs/en_US/TrainingService/Overview.rst index 5f0727e8dd2..6a4b5e91c16 100644 --- a/docs/en_US/TrainingService/Overview.rst +++ b/docs/en_US/TrainingService/Overview.rst @@ -6,7 +6,7 @@ What is Training Service? NNI training service is designed to allow users to focus on AutoML itself, agnostic to the underlying computing infrastructure where the trials are actually run. When migrating from one cluster to another (e.g., local machine to Kubeflow), users only need to tweak several configurations, and the experiment can be easily scaled. -Users can use training service provided by NNI, to run trial jobs on `local machine <./LocalMode.rst>`__\ , `remote machines <./RemoteMachineMode.rst>`__\ , and on clusters like `PAI <./PaiMode.rst>`__\ , `Kubeflow <./KubeflowMode.rst>`__\ , `AdaptDL <./AdaptDLMode.rst>`__\ , `FrameworkController <./FrameworkControllerMode.rst>`__\ , `DLTS <./DLTSMode.rst>`__ and `AML <./AMLMode.rst>`__. These are called *built-in training services*. +Users can use training service provided by NNI, to run trial jobs on `local machine <./LocalMode.rst>`__\ , `remote machines <./RemoteMachineMode.rst>`__\ , and on clusters like `PAI <./PaiMode.rst>`__\ , `Kubeflow <./KubeflowMode.rst>`__\ , `AdaptDL <./AdaptDLMode.rst>`__\ , `FrameworkController <./FrameworkControllerMode.rst>`__\ , `DLTS <./DLTSMode.rst>`__, `AML <./AMLMode.rst>`__ and `DLC <./DLCMode.rst>`__. These are called *built-in training services*. If the computing resource customers try to use is not listed above, NNI provides interface that allows users to build their own training service easily. Please refer to `how to implement training service <./HowToImplementTrainingService.rst>`__ for details. @@ -44,6 +44,8 @@ Built-in Training Services - NNI supports running experiment using `DLTS `__\ , which is an open source toolkit, developed by Microsoft, that allows AI scientists to spin up an AI cluster in turn-key fashion. 
* - `AML <./AMLMode.rst>`__ - NNI supports running an experiment on `AML <https://azure.microsoft.com/en-us/services/machine-learning/>`__ , called aml mode. + * - `DLC <./DLCMode.rst>`__ + - NNI supports running an experiment on `PAI-DLC <https://www.aliyun.com/activity/bigdata/pai-dlc>`__ , called dlc mode. What does Training Service do? @@ -77,4 +79,4 @@ When reuse mode is enabled, a cluster, such as a remote machine or a computer in In the reuse mode, user needs to make sure each trial can run independently in the same job (e.g., avoid loading checkpoints from previous trials). -.. note:: Currently, only `Local <./LocalMode.rst>`__, `Remote <./RemoteMachineMode.rst>`__, `OpenPAI <./PaiMode.rst>`__ and `AML <./AMLMode.rst>`__ training services support resue mode. For Remote and OpenPAI training platforms, you can enable reuse mode according to `here <../reference/experiment_config.rst>`__ manually. AML is implemented under reuse mode, so the default mode is reuse mode, no need to manually enable. +.. note:: Currently, only `Local <./LocalMode.rst>`__, `Remote <./RemoteMachineMode.rst>`__, `OpenPAI <./PaiMode.rst>`__, `AML <./AMLMode.rst>`__ and `DLC <./DLCMode.rst>`__ training services support reuse mode. For Remote and OpenPAI training platforms, you can enable reuse mode according to `here <../reference/experiment_config.rst>`__ manually. AML is implemented under reuse mode, so the default mode is reuse mode, no need to manually enable. 
diff --git a/docs/en_US/reference/experiment_config.rst b/docs/en_US/reference/experiment_config.rst index fc6dfb79dd1..0a98fc14e11 100644 --- a/docs/en_US/reference/experiment_config.rst +++ b/docs/en_US/reference/experiment_config.rst @@ -409,6 +409,7 @@ One of the following: - `RemoteConfig`_ - :ref:`OpenpaiConfig ` - `AmlConfig`_ +- `DlcConfig`_ - `HybridConfig`_ For `Kubeflow <../TrainingService/KubeflowMode.rst>`_, `FrameworkController <../TrainingService/FrameworkControllerMode.rst>`_, and `AdaptDL <../TrainingService/AdaptDLMode.rst>`_ training platforms, it is suggested to use `v1 config schema <../Tutorial/ExperimentConfig.rst>`_ for now. @@ -797,6 +798,111 @@ AML compute cluster name. type: ``str`` +DlcConfig +--------- + +Detailed usage can be found `here <../TrainingService/DLCMode.rst>`__. + + +platform +"""""""" + +Constant string ``"dlc"``. + + +type +"""" + +Job spec type. + +type: ``str`` + +default: ``"worker"`` + + +image +""""" + +Name and tag of docker image to run the trials. + +type: ``str`` + + +jobType +""""""" + +PAI-DLC training job type, ``"TFJob"`` or ``"PyTorchJob"``. + +type: ``str`` + + +podCount +"""""""" + +Pod count to run a single training job. + +type: ``str`` + + +ecsSpec +""""""" + +Training server config spec string. + +type: ``str`` + + +region +"""""" + +The region where PAI-DLC public-cluster locates. + +type: ``str`` + + +nasDataSourceId +""""""""""""""" + +The NAS datasource id configured in PAI-DLC side. + +type: ``str`` + + + +accessKeyId +""""""""""" + +The accessKeyId of your cloud account. + +type: ``str`` + + + +accessKeySecret +""""""""""""""" + +The accessKeySecret of your cloud account. + +type: ``str`` + + + +localStorageMountPoint +"""""""""""""""""""""" + +The mount point of the NAS on PAI-DSW server, default is /home/admin/workspace/. + +type: ``str`` + + +containerStorageMountPoint +"""""""""""""""""""""""""" + +The mount point of the NAS on PAI-DLC side, default is /root/data/. 
+ +type: ``str`` + + HybridConfig ------------ @@ -933,3 +1039,5 @@ containerName AzureBlob container name. type: ``str`` + +ion \ No newline at end of file diff --git a/examples/trials/mnist-pytorch/config_dlc.yml b/examples/trials/mnist-pytorch/config_dlc.yml index f12ad170fe2..d4372acad48 100644 --- a/examples/trials/mnist-pytorch/config_dlc.yml +++ b/examples/trials/mnist-pytorch/config_dlc.yml @@ -1,5 +1,4 @@ # working directory on DSW, please provie FULL path -experimentWorkingDirectory: /home/admin/workspace/{your_working_dir} searchSpaceFile: search_space.json # the command on trial runner(or, DLC container), be aware of data_dir trialCommand: python mnist.py --data_dir /root/data/{your_data_dir} @@ -22,5 +21,5 @@ trainingService: accessKeyId: ${your_ak_id} accessKeySecret: ${your_ak_key} nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a - localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW + localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path. 
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according your setting diff --git a/ts/nni_manager/config/dlc/dlcUtil.py b/ts/nni_manager/config/dlc/dlcUtil.py index c046463bb1a..333fc5e0788 100644 --- a/ts/nni_manager/config/dlc/dlcUtil.py +++ b/ts/nni_manager/config/dlc/dlcUtil.py @@ -72,9 +72,3 @@ elif line == 'stop': client.stop_job(job_id) exit(0) - elif line == 'receive': - print('receive:' + json.dumps(run.get_metrics())) - elif line: - items = line.split(':') - if items[0] == 'command': - run.log('nni_manager', line[8:]) diff --git a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts index 180dd40cd81..d7b655b7495 100644 --- a/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts +++ b/ts/nni_manager/training_service/reusable/dlc/dlcClient.ts @@ -58,7 +58,7 @@ export class DlcClient { private getScript(): string[] { const script: string[] = []; script.push( - `python ./config/dlc/dlcUtil.py --type ${this.type} --image ${this.image} --job_type ${this.jobType} ` + + `python3 ./config/dlc/dlcUtil.py --type ${this.type} --image ${this.image} --job_type ${this.jobType} ` + `--pod_count ${this.podCount} --ecs_spec ${this.ecsSpec} --experiment_name nni_exp_${this.experimentId} ` + `--region ${this.region} --nas_data_source_id ${this.nasDataSourceId} --access_key_id ${this.accessKeyId} ` + `--access_key_secret ${this.accessKeySecret} --user_command "${this.userCommand}"` ); @@ -69,7 +69,7 @@ export class DlcClient { const deferred: Deferred = new Deferred(); this.pythonShellClient = new PythonShell('dlcUtil.py', { scriptPath: './config/dlc', - pythonPath: 'python', + pythonPath: 'python3', pythonOptions: ['-u'], // get print results in real-time args: [ '--type', this.type, diff --git a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts index 
08add0b68e3..dd434c8bed4 100644 --- a/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts @@ -28,19 +28,21 @@ export class DlcEnvironmentService extends EnvironmentService { private readonly log: Logger = getLogger('dlcEnvironmentService'); private experimentId: string; - private experimentRootDir: string; private config: FlattenDlcConfig; constructor(config: ExperimentConfig, info: ExperimentStartupInfo) { super(); this.experimentId = info.experimentId; - this.experimentRootDir = info.logDir; this.config = flattenConfig(config, 'dlc'); component.Container.bind(StorageService).to(MountedStorageService).scope(Scope.Singleton); + const storageService = component.get(StorageService) + const remoteRoot = storageService.joinPath(this.config.localStorageMountPoint, 'nni-experiments', this.experimentId); + const localRoot = storageService.joinPath(this.config.localStorageMountPoint, 'nni-experiments'); + storageService.initialize(localRoot, remoteRoot); } public get hasStorageService(): boolean { - return false; + return true; } public initCommandChannel(eventEmitter: EventEmitter): void { @@ -91,28 +93,20 @@ export class DlcEnvironmentService extends EnvironmentService { public async startEnvironment(environment: EnvironmentInformation): Promise { const dlcEnvironment: DlcEnvironmentInformation = environment as DlcEnvironmentInformation; - const environmentLocalTempFolder = path.join(this.experimentRootDir, "environment-temp"); - if (!fs.existsSync(environmentLocalTempFolder)) { - await fs.promises.mkdir(environmentLocalTempFolder, {recursive: true}); - } - const dlcFolder: string = this.experimentRootDir.replace( - this.config.localStorageMountPoint, this.config.containerStorageMountPoint); - dlcEnvironment.workingFolder = `${this.experimentRootDir}/envs/${environment.id}`; - dlcEnvironment.runnerWorkingFolder = `${dlcFolder}/envs/${environment.id}`; - let 
script: string = environment.command; + const environmentRoot = path.join(this.config.containerStorageMountPoint, `/nni-experiments/${this.experimentId}`); + const localRoot = path.join(this.config.localStorageMountPoint, `/nni-experiments/${this.experimentId}`); + + dlcEnvironment.workingFolder = `${localRoot}/envs/${environment.id}`; + dlcEnvironment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`; - // environment id dir and command dir + // environment id dir and command dir, folder created on DLC side can't be accessed on DSW. if (!fs.existsSync(`${dlcEnvironment.workingFolder}/commands`)) { await fs.promises.mkdir(`${dlcEnvironment.workingFolder}/commands`, {recursive: true}); } - const prepare = `cd ${dlcEnvironment.runnerWorkingFolder} && cp -r ../../environment-temp/envs/* ../`; - const startrun = `sh ../install_nni.sh && python -m nni.tools.trial_tool.trial_runner`; - - script = `${prepare} && ${startrun}`; - script = `${script} --job_pid_file ${environment.runnerWorkingFolder}/pid \ - 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr`; + environment.command = `cd ${environmentRoot} && ${environment.command}`; + environment.command = `${environment.command} 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr`; const dlcClient = new DlcClient( this.config.type, @@ -126,7 +120,7 @@ export class DlcEnvironmentService extends EnvironmentService { this.config.nasDataSourceId, this.config.accessKeyId, this.config.accessKeySecret, - script, + environment.command, ); dlcEnvironment.id = await dlcClient.submit();