From 19f95f019a2015b68f0e8a91cef54205bb84570c Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Wed, 24 Mar 2021 00:31:04 +0800 Subject: [PATCH 1/3] init --- nni/tools/trial_tool/trial.py | 2 +- .../environments/remoteEnvironmentService.ts | 32 ++++++------------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/nni/tools/trial_tool/trial.py b/nni/tools/trial_tool/trial.py index 1da398d017..67675dc308 100644 --- a/nni/tools/trial_tool/trial.py +++ b/nni/tools/trial_tool/trial.py @@ -47,7 +47,7 @@ def run(self): nni_log(LogType.Info, "%s: start to run trial" % self.name) - trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) + trial_working_dir = os.path.realpath(os.path.join(os.curdir, "trials", self.id)) self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) trial_code_dir = os.path.join(trial_working_dir, "code") trial_nnioutput_dir = os.path.join(trial_working_dir, "nnioutput") diff --git a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts index bba79e0cf0..b83e44d131 100644 --- a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts @@ -20,7 +20,6 @@ import { } from '../../remote_machine/remoteMachineData'; import { ShellExecutor } from 'training_service/remote_machine/shellExecutor'; import { RemoteMachineEnvironmentInformation } from '../remote/remoteConfig'; -import { SharedStorageService } from '../sharedStorage' @component.Singleton @@ -129,7 +128,7 @@ export class RemoteEnvironmentService extends EnvironmentService { this.log.debug(`initializing ${executor.name}`); // Create root working directory after executor is ready - const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni-experiments'); + const nniRootDir: string = 
executor.joinPath(executor.getTempPath(), 'nni'); await executor.createFolder(executor.getRemoteExperimentRootDir(getExperimentId())); // the directory to store temp scripts in remote machine @@ -248,20 +247,13 @@ export class RemoteEnvironmentService extends EnvironmentService { } this.environmentExecutorManagerMap.set(environment.id, executorManager); const executor = await this.getExecutor(environment.id); - if (environment.useSharedStorage) { - const environmentRoot = component.get(SharedStorageService).remoteWorkingRoot; - environment.runnerWorkingFolder = executor.joinPath(environmentRoot, 'envs', environment.id) - const remoteMountCommand = component.get(SharedStorageService).remoteMountCommand; - await executor.executeScript(remoteMountCommand, false, false); - } else { - environment.runnerWorkingFolder = - executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), - 'envs', environment.id) - } - environment.command = `cd ${environment.runnerWorkingFolder} && \ - ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ - 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ - && echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; + environment.runnerWorkingFolder = + executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), + 'envs', environment.id) + environment.command = `cd ${executor.getRemoteExperimentRootDir(getExperimentId())} && \ +${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ +1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ +&& echo $? 
\`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; return Promise.resolve(true); } } @@ -272,13 +264,13 @@ export class RemoteEnvironmentService extends EnvironmentService { } const executor = await this.getExecutor(environment.id); const environmentLocalTempFolder: string = - path.join(this.experimentRootDir, "environment-temp") + path.join(this.experimentRootDir, this.experimentId, "environment-temp") await executor.createFolder(environment.runnerWorkingFolder); await execMkdir(environmentLocalTempFolder); await fs.promises.writeFile(path.join(environmentLocalTempFolder, executor.getScriptName("run")), environment.command, { encoding: 'utf8' }); // Copy files in codeDir to remote working directory - await executor.copyDirectoryToRemote(environmentLocalTempFolder, environment.runnerWorkingFolder); + await executor.copyDirectoryToRemote(environmentLocalTempFolder, executor.getRemoteExperimentRootDir(getExperimentId())); // Execute command in remote machine, set isInteractive=true to run script in conda environment executor.executeScript(executor.joinPath(environment.runnerWorkingFolder, executor.getScriptName("run")), true, true); @@ -297,10 +289,6 @@ export class RemoteEnvironmentService extends EnvironmentService { } public async stopEnvironment(environment: EnvironmentInformation): Promise<void> { - if (environment.isAlive === false) { - return Promise.resolve(); - } - const executor = await this.getExecutor(environment.id); if (environment.status === 'UNKNOWN') { From 93a0345161467f634c6ade9b1007267695a0a192 Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Wed, 24 Mar 2021 00:53:21 +0800 Subject: [PATCH 2/3] update --- .../environments/remoteEnvironmentService.ts | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts index b83e44d131..29564668a7 100644 --- 
a/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts +++ b/ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts @@ -20,6 +20,7 @@ import { } from '../../remote_machine/remoteMachineData'; import { ShellExecutor } from 'training_service/remote_machine/shellExecutor'; import { RemoteMachineEnvironmentInformation } from '../remote/remoteConfig'; +import { SharedStorageService } from '../sharedStorage' @component.Singleton @@ -33,6 +34,7 @@ export class RemoteEnvironmentService extends EnvironmentService { private readonly log: Logger; private sshConnectionPromises: any[]; private experimentRootDir: string; + private remoteExperimentRootDir: string = ""; private experimentId: string; constructor() { @@ -128,7 +130,7 @@ export class RemoteEnvironmentService extends EnvironmentService { this.log.debug(`initializing ${executor.name}`); // Create root working directory after executor is ready - const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni'); + const nniRootDir: string = executor.joinPath(executor.getTempPath(), 'nni-experiments'); await executor.createFolder(executor.getRemoteExperimentRootDir(getExperimentId())); // the directory to store temp scripts in remote machine @@ -247,13 +249,18 @@ export class RemoteEnvironmentService extends EnvironmentService { } this.environmentExecutorManagerMap.set(environment.id, executorManager); const executor = await this.getExecutor(environment.id); - environment.runnerWorkingFolder = - executor.joinPath(executor.getRemoteExperimentRootDir(getExperimentId()), - 'envs', environment.id) - environment.command = `cd ${executor.getRemoteExperimentRootDir(getExperimentId())} && \ -${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ -1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ -&& echo $? 
\`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; + if (environment.useSharedStorage) { + this.remoteExperimentRootDir = component.get(SharedStorageService).remoteWorkingRoot; + const remoteMountCommand = component.get(SharedStorageService).remoteMountCommand; + await executor.executeScript(remoteMountCommand, false, false); + } else { + this.remoteExperimentRootDir = executor.getRemoteExperimentRootDir(getExperimentId()); + } + environment.runnerWorkingFolder = executor.joinPath(this.remoteExperimentRootDir, 'envs', environment.id); + environment.command = `cd ${this.remoteExperimentRootDir} && \ + ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ + 1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \ + && echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`; return Promise.resolve(true); } } @@ -264,13 +271,13 @@ ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ } const executor = await this.getExecutor(environment.id); const environmentLocalTempFolder: string = - path.join(this.experimentRootDir, this.experimentId, "environment-temp") + path.join(this.experimentRootDir, "environment-temp") await executor.createFolder(environment.runnerWorkingFolder); await execMkdir(environmentLocalTempFolder); await fs.promises.writeFile(path.join(environmentLocalTempFolder, executor.getScriptName("run")), environment.command, { encoding: 'utf8' }); // Copy files in codeDir to remote working directory - await executor.copyDirectoryToRemote(environmentLocalTempFolder, executor.getRemoteExperimentRootDir(getExperimentId())); + await executor.copyDirectoryToRemote(environmentLocalTempFolder, this.remoteExperimentRootDir); // Execute command in remote machine, set isInteractive=true to run script in conda environment executor.executeScript(executor.joinPath(environment.runnerWorkingFolder, executor.getScriptName("run")), true, true); @@ -289,6 
+296,10 @@ ${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \ } public async stopEnvironment(environment: EnvironmentInformation): Promise<void> { + if (environment.isAlive === false) { + return Promise.resolve(); + } + const executor = await this.getExecutor(environment.id); if (environment.status === 'UNKNOWN') { From 942b196b7d6d296f903f732061409cc3f4ace91c Mon Sep 17 00:00:00 2001 From: SparkSnail Date: Mon, 29 Mar 2021 11:56:44 +0800 Subject: [PATCH 3/3] revert change --- nni/tools/trial_tool/trial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/tools/trial_tool/trial.py b/nni/tools/trial_tool/trial.py index 67675dc308..1da398d017 100644 --- a/nni/tools/trial_tool/trial.py +++ b/nni/tools/trial_tool/trial.py @@ -47,7 +47,7 @@ def run(self): nni_log(LogType.Info, "%s: start to run trial" % self.name) - trial_working_dir = os.path.realpath(os.path.join(os.curdir, "trials", self.id)) + trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) trial_code_dir = os.path.join(trial_working_dir, "code") trial_nnioutput_dir = os.path.join(trial_working_dir, "nnioutput")