From fab183c7c9ce265fe69fb00256648a8f45218584 Mon Sep 17 00:00:00 2001
From: liuzhe <zhe.liu@microsoft.com>
Date: Mon, 29 Jul 2019 16:31:08 +0800
Subject: [PATCH 1/7] Handle nvidia-smi not exist problem

---
 tools/nni_gpu_tool/gpu_metrics_collector.py | 22 ++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py
index 56095a0362..1da8ccb3ff 100644
--- a/tools/nni_gpu_tool/gpu_metrics_collector.py
+++ b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -21,6 +21,7 @@ import subprocess
 import sys
 import time
+import traceback
 from xml.dom import minidom

@@ -33,7 +34,7 @@ def check_ready_to_run():
         pidList.remove(os.getpid())
         return len(pidList) == 0
     else:
-        pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
+        pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
         pidList = []
         for pid in pgrep_output.splitlines():
             pidList.append(int(pid))
@@ -48,11 +49,13 @@ def main(argv):
     with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile:
         pass
     os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
-    cmd = 'nvidia-smi -q -x'
+    cmd = 'nvidia-smi -q -x'.split()
     while(True):
         try:
-            smi_output = subprocess.check_output(cmd, shell=True)
+            smi_output = subprocess.check_output(cmd)
             parse_nvidia_smi_result(smi_output, metrics_output_dir)
+        except FileNotFoundError:
+            gen_empty_gpu_metric(smi_output)
         except:
             exception = sys.exc_info()
             for e in exception:
@@ -86,6 +89,19 @@ def parse_nvidia_smi_result(smi, outputDir):
         e_info = sys.exc_info()
         print('xmldoc paring error')

+def gen_empty_gpu_metric(outputDir):
+    try:
+        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
+            outPut = {}
+            outPut["Timestamp"] = time.asctime(time.localtime())
+            outPut["gpuCount"] = 0
+            outPut["gpuInfos"] = []
+            print(outPut)
+            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
+            outputFile.flush()
+    except Exception:
+        traceback.print_exc()
+

 if __name__ == "__main__":
     main(sys.argv[1:])
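Note on the patch above: because the command is now passed as an argument list without shell=True, a missing nvidia-smi binary surfaces as FileNotFoundError from the exec call itself, rather than as a failure inside an intermediate shell. A minimal sketch of the behavior the patch relies on (the probe helper is hypothetical, not NNI code):

    import subprocess

    def probe_nvidia_smi():
        # Hypothetical helper illustrating the error modes.
        try:
            # With a list argument and shell=False, a missing binary raises
            # FileNotFoundError instead of spawning a shell that fails.
            return subprocess.check_output(['nvidia-smi', '-q', '-x'])
        except FileNotFoundError:
            return None  # nvidia-smi not installed / not on PATH
        except subprocess.CalledProcessError as err:
            # Binary exists but exited non-zero (e.g. driver trouble).
            raise RuntimeError('nvidia-smi failed with code {}'.format(err.returncode))

As committed, the new handler calls gen_empty_gpu_metric(smi_output), passing a variable that is still unassigned when check_output raises on the first iteration (and the function expects an output directory in any case); the next patch corrects the argument.
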
From 507434961fb15d3dcb1f696d1de752e4d7fb7245 Mon Sep 17 00:00:00 2001
From: liuzhe <zhe.liu@microsoft.com>
Date: Tue, 6 Aug 2019 13:04:18 +0800
Subject: [PATCH 2/7] bugfix

---
 tools/nni_gpu_tool/gpu_metrics_collector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py
index 1da8ccb3ff..7847a52a5c 100644
--- a/tools/nni_gpu_tool/gpu_metrics_collector.py
+++ b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -55,7 +55,7 @@ def main(argv):
             smi_output = subprocess.check_output(cmd)
             parse_nvidia_smi_result(smi_output, metrics_output_dir)
         except FileNotFoundError:
-            gen_empty_gpu_metric(smi_output)
+            gen_empty_gpu_metric(metrics_output_dir)
         except:
             exception = sys.exc_info()
             for e in exception:
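With the argument fixed, a collector running on a machine without nvidia-smi appends one summary line to gpu_metrics. A sketch of what that line looks like, reconstructed from gen_empty_gpu_metric above (the timestamp value is illustrative):

    import json
    import time

    record = {
        "Timestamp": time.asctime(time.localtime()),
        "gpuCount": 0,
        "gpuInfos": [],
    }
    # json.dumps(..., sort_keys=True) yields, e.g.:
    # {"Timestamp": "Tue Aug  6 13:04:18 2019", "gpuCount": 0, "gpuInfos": []}
    print(json.dumps(record, sort_keys=True))
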
From 69239b4bc33ed2e401e30b9ab9fe4298cd1e3c0c Mon Sep 17 00:00:00 2001
From: liuzhe <zhe.liu@microsoft.com>
Date: Thu, 8 Aug 2019 16:11:14 +0800
Subject: [PATCH 3/7] Notify user when GPU not found

---
 Makefile                                                      |  2 +-
 examples/trials/mnist/config.yml                              |  2 +-
 .../training_service/local/gpuScheduler.ts                    |  3 ++
 .../local/localTrainingService.ts                             | 42 +++++++++----------
 .../remoteMachineTrainingService.ts                           |  6 ++-
 tools/nni_gpu_tool/gpu_metrics_collector.py                   |  1 +
 6 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/Makefile b/Makefile
index 9217c83dba..8877e5c2ae 100644
--- a/Makefile
+++ b/Makefile
@@ -54,7 +54,7 @@ NNI_NODE_FOLDER = $(NNI_DEPENDENCY_FOLDER)/nni-node-$(OS_SPEC)-x64
 NNI_NODE ?= $(BIN_FOLDER)/node
 NNI_YARN_TARBALL ?= $(NNI_DEPENDENCY_FOLDER)/nni-yarn.tar.gz
 NNI_YARN_FOLDER ?= $(NNI_DEPENDENCY_FOLDER)/nni-yarn
-NNI_YARN := PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
+NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn

 ## Version number
 NNI_VERSION_VALUE = $(shell git describe --tags)

diff --git a/examples/trials/mnist/config.yml b/examples/trials/mnist/config.yml
index 794ca1cef6..231c661ec3 100644
--- a/examples/trials/mnist/config.yml
+++ b/examples/trials/mnist/config.yml
@@ -18,4 +18,4 @@ tuner:
 trial:
   command: python3 mnist.py
   codeDir: .
-  gpuNum: 0
+  gpuNum: 1

diff --git a/src/nni_manager/training_service/local/gpuScheduler.ts b/src/nni_manager/training_service/local/gpuScheduler.ts
index 933235a222..03e8d91afa 100644
--- a/src/nni_manager/training_service/local/gpuScheduler.ts
+++ b/src/nni_manager/training_service/local/gpuScheduler.ts
@@ -54,6 +54,9 @@ class GPUScheduler {
             } catch (error) {
                 this.log.error('Read GPU summary failed with error: ', error);
             }
+            if (this.gpuSummary.gpuCount === 0) {
+                throw new Error('GPU not available. Please check your CUDA configuration');
+            }
             await delay(5000);
         }
     }

diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts
index 1fb2cb9327..c47be9ec32 100644
--- a/src/nni_manager/training_service/local/localTrainingService.ts
+++ b/src/nni_manager/training_service/local/localTrainingService.ts
@@ -131,7 +131,7 @@ class LocalTrainingService implements TrainingService {
     private readonly occupiedGpuIndexNumMap: Map<number, number>;
     private designatedGpuIndices!: Set<number>;
     private readonly log: Logger;
-    private localTrailConfig?: TrialConfig;
+    private localTrialConfig?: TrialConfig;
     private localConfig?: LocalConfig;
     private isMultiPhase: boolean;
     private readonly jobStreamMap: Map<string, ts.Stream>;
@@ -204,7 +204,7 @@ class LocalTrainingService implements TrainingService {
             } catch (error) {
                 //ignore
             }
-            this.log.debug(`trailJob status update: ${trialJobId}, ${trialJob.status}`);
+            this.log.debug(`trialJob status update: ${trialJobId}, ${trialJob.status}`);
         }
     }
@@ -302,14 +302,14 @@ class LocalTrainingService implements TrainingService {
             }
             switch (key) {
                 case TrialConfigMetadataKey.TRIAL_CONFIG:
-                    this.localTrailConfig = <TrialConfig>JSON.parse(value);
+                    this.localTrialConfig = <TrialConfig>JSON.parse(value);
                     // Parse trial config failed, throw Error
-                    if (this.localTrailConfig === undefined) {
+                    if (this.localTrialConfig === undefined) {
                         throw new Error('trial config parsed failed');
                     }
-                    if (this.localTrailConfig.gpuNum !== undefined) {
-                        this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
-                        if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
+                    if (this.localTrialConfig.gpuNum !== undefined) {
+                        this.log.info(`required GPU number is ${this.localTrialConfig.gpuNum}`);
+                        if (this.gpuScheduler === undefined && this.localTrialConfig.gpuNum > 0) {
                             this.gpuScheduler = new GPUScheduler();
                         }
                     }
@@ -343,10 +343,10 @@ class LocalTrainingService implements TrainingService {
             switch (key) {
                 case TrialConfigMetadataKey.TRIAL_CONFIG:
                     let getResult: Promise<string>;
-                    if (this.localTrailConfig === undefined) {
+                    if (this.localTrialConfig === undefined) {
                         getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
                     } else {
-                        getResult = Promise.resolve(JSON.stringify(this.localTrailConfig));
+                        getResult = Promise.resolve(JSON.stringify(this.localTrialConfig));
                     }

                     return getResult;
@@ -427,8 +427,8 @@ class LocalTrainingService implements TrainingService {
     }

     private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] {
-        if (this.localTrailConfig === undefined) {
-            throw new Error('localTrailConfig is not initialized!');
+        if (this.localTrialConfig === undefined) {
+            throw new Error('localTrialConfig is not initialized!');
         }

         const resource: { gpuIndices: number[] } = { gpuIndices: [] };
@@ -450,11 +450,11 @@ class LocalTrainingService implements TrainingService {
                 selectedGPUIndices = selectedGPUIndices.filter((index: number) => this.designatedGpuIndices.has(index));
             }

-            if (selectedGPUIndices.length < this.localTrailConfig.gpuNum) {
+            if (selectedGPUIndices.length < this.localTrialConfig.gpuNum) {
                 return [false, resource];
             }

-            selectedGPUIndices.splice(this.localTrailConfig.gpuNum);
+            selectedGPUIndices.splice(this.localTrialConfig.gpuNum);
             Object.assign(resource, { gpuIndices: selectedGPUIndices });

             return [true, resource];
@@ -512,17 +512,17 @@ class LocalTrainingService implements TrainingService {
         }
     }

-    private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
+    private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
         const script: string[] = [];
         if (process.platform === 'win32') {
             script.push(
-                `cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
+                `cmd /c ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
                 `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
                 `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
                 `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
         } else {
             script.push(
-                `eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
+                `eval ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
                 `echo $? \`date +%s%3N\` >${path.join(workingDirectory, '.nni', 'state')}`);
         }

@@ -531,23 +531,23 @@ class LocalTrainingService implements TrainingService {

     private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
         const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
-        if (this.localTrailConfig === undefined) {
+        if (this.localTrialConfig === undefined) {
             throw new Error(`localTrialConfig not initialized!`);
         }
-        const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum);
+        const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrialConfig.gpuNum);

-        if (this.localTrailConfig === undefined) {
+        if (this.localTrialConfig === undefined) {
             throw new Error('trial config is not initialized');
         }
         const runScriptContent: string[] = [];
         if (process.platform !== 'win32') {
             runScriptContent.push('#!/bin/bash');
         }
-        runScriptContent.push(`cd ${this.localTrailConfig.codeDir}`);
+        runScriptContent.push(`cd ${this.localTrialConfig.codeDir}`);
         for (const variable of variables) {
             runScriptContent.push(setEnvironmentVariable(variable));
         }
-        const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
+        const scripts: string[] = this.getScript(this.localTrialConfig, trialJobDetail.workingDirectory);
         scripts.forEach((script: string) => {
             runScriptContent.push(script);
         });

diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
index c55c28427b..b9d42f6fc3 100644
--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -511,12 +511,16 @@ class RemoteMachineTrainingService implements TrainingService {
         // tslint:disable-next-line: no-floating-promises
         SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);

-        this.timer.subscribe(
+        const disposable = this.timer.subscribe(
             async (tick: number) => {
                 const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
                     `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
                 if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
                     rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
+                    if (rmMeta.gpuSummary.gpuCount == 0) {
+                        this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
+                        this.timer.unsubscribe(disposable);
+                    }
                 }
             }
         );

diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py
index 7847a52a5c..41c1878486 100644
--- a/tools/nni_gpu_tool/gpu_metrics_collector.py
+++ b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -56,6 +56,7 @@ def main(argv):
             parse_nvidia_smi_result(smi_output, metrics_output_dir)
         except FileNotFoundError:
             gen_empty_gpu_metric(metrics_output_dir)
+            break
         except:
             exception = sys.exc_info()
             for e in exception:
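The remote training service above consumes the metrics file with `tail -n 1` over SSH, parses the last line as a GPUSummary, and, when gpuCount is 0, logs a warning and unsubscribes its polling timer. The same read pattern, sketched in Python against a local file (field names as written by the collector; the path argument is an assumption for illustration):

    import json

    def read_gpu_summary(path='gpu_metrics'):
        # Read only the most recent summary line, mirroring `tail -n 1`.
        with open(path) as f:
            last_line = f.readlines()[-1]
        summary = json.loads(last_line)
        if summary['gpuCount'] == 0:
            # The remote service logs a warning and stops polling here.
            print('No GPU found')
        return summary
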
From 674f75b79a55636abfc975886c306a9e5056bdeb Mon Sep 17 00:00:00 2001
From: liuzhe <zhe.liu@microsoft.com>
Date: Thu, 8 Aug 2019 16:18:06 +0800
Subject: [PATCH 4/7] Fix minor issues

---
 examples/trials/mnist/config.yml                         | 2 +-
 .../remote_machine/remoteMachineTrainingService.ts       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/trials/mnist/config.yml b/examples/trials/mnist/config.yml
index 231c661ec3..794ca1cef6 100644
--- a/examples/trials/mnist/config.yml
+++ b/examples/trials/mnist/config.yml
@@ -18,4 +18,4 @@ tuner:
 trial:
   command: python3 mnist.py
   codeDir: .
-  gpuNum: 1
+  gpuNum: 0

diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
index b9d42f6fc3..35631f1ce9 100644
--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -511,13 +511,13 @@ class RemoteMachineTrainingService implements TrainingService {
         // tslint:disable-next-line: no-floating-promises
         SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);

-        const disposable = this.timer.subscribe(
+        const disposable: Rx.IDisposable = this.timer.subscribe(
             async (tick: number) => {
                 const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
                     `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
                 if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
                     rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
-                    if (rmMeta.gpuSummary.gpuCount == 0) {
+                    if (rmMeta.gpuSummary.gpuCount === 0) {
                         this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
                         this.timer.unsubscribe(disposable);
                     }
From f09f6ef99ce6a5a8554a54246e48155efe309f97 Mon Sep 17 00:00:00 2001
From: liuzhe <zhe.liu@microsoft.com>
Date: Fri, 9 Aug 2019 09:50:24 +0800
Subject: [PATCH 5/7] Catch other errors and bugfix

---
 src/nni_manager/training_service/local/gpuScheduler.ts   | 2 +-
 .../remote_machine/remoteMachineTrainingService.ts       | 2 +-
 tools/nni_gpu_tool/gpu_metrics_collector.py              | 9 +++------
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/nni_manager/training_service/local/gpuScheduler.ts b/src/nni_manager/training_service/local/gpuScheduler.ts
index 03e8d91afa..017a2af38c 100644
--- a/src/nni_manager/training_service/local/gpuScheduler.ts
+++ b/src/nni_manager/training_service/local/gpuScheduler.ts
@@ -54,7 +54,7 @@ class GPUScheduler {
             } catch (error) {
                 this.log.error('Read GPU summary failed with error: ', error);
             }
-            if (this.gpuSummary.gpuCount === 0) {
+            if (this.gpuSummary !== undefined && this.gpuSummary.gpuCount === 0) {
                 throw new Error('GPU not available. Please check your CUDA configuration');
             }
             await delay(5000);

diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
index 35631f1ce9..26e172f6fe 100644
--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -517,7 +517,7 @@ class RemoteMachineTrainingService implements TrainingService {
                     `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
                 if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
                     rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
-                    if (rmMeta.gpuSummary.gpuCount === 0) {
+                    if (rmMeta.gpuSummary !== undefined && rmMeta.gpuSummary.gpuCount === 0) {
                         this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
                         this.timer.unsubscribe(disposable);
                     }

diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py
index 41c1878486..6b0681c5e4 100644
--- a/tools/nni_gpu_tool/gpu_metrics_collector.py
+++ b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -53,14 +53,11 @@ def main(argv):
     while(True):
         try:
             smi_output = subprocess.check_output(cmd)
-            parse_nvidia_smi_result(smi_output, metrics_output_dir)
-        except FileNotFoundError:
+        except Exception:
+            traceback.print_exc()
             gen_empty_gpu_metric(metrics_output_dir)
             break
-        except:
-            exception = sys.exc_info()
-            for e in exception:
-                print("job exporter error {}".format(e))
+        parse_nvidia_smi_result(smi_output, metrics_output_dir)
         # TODO: change to sleep time configurable via arguments
         time.sleep(5)
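Patch 5 also tightens the collector's error handling: the bare `except:` that printed the raw sys.exc_info() tuple becomes `except Exception:` with traceback.print_exc(), and parsing moves out of the try block so only the nvidia-smi invocation is guarded. A small illustration of the difference (simulated failure, not NNI code):

    import traceback

    try:
        raise ValueError('simulated nvidia-smi failure')
    except Exception:
        # Prints the full stack trace; a bare `except:` would additionally
        # have swallowed KeyboardInterrupt and SystemExit, which is
        # undesirable in a long-running daemon loop.
        traceback.print_exc()
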
From ea0f70bb52f27287f09a9281e73637974353d70e Mon Sep 17 00:00:00 2001
From: liuzhe <zhe.liu@microsoft.com>
Date: Fri, 9 Aug 2019 10:32:55 +0800
Subject: [PATCH 6/7] Avoid using chmod

---
 tools/nni_gpu_tool/gpu_metrics_collector.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py
index 6b0681c5e4..f58b9b2895 100644
--- a/tools/nni_gpu_tool/gpu_metrics_collector.py
+++ b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -46,9 +46,6 @@ def main(argv):
     if check_ready_to_run() == False:
         # GPU metrics collector is already running. Exit
         exit(2)
-    with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile:
-        pass
-    os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
     cmd = 'nvidia-smi -q -x'.split()
     while(True):
         try:
@@ -63,6 +60,7 @@ def main(argv):

 def parse_nvidia_smi_result(smi, outputDir):
     try:
+        old_umask = os.umask(0)
         xmldoc = minidom.parseString(smi)
         gpuList = xmldoc.getElementsByTagName('gpu')
         with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
@@ -86,9 +84,12 @@ def parse_nvidia_smi_result(smi, outputDir):
     except :
         e_info = sys.exc_info()
         print('xmldoc paring error')
+    finally:
+        os.umask(old_umask)

 def gen_empty_gpu_metric(outputDir):
     try:
+        old_umask = os.umask(0)
         with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
             outPut = {}
             outPut["Timestamp"] = time.asctime(time.localtime())
@@ -99,6 +100,8 @@ def gen_empty_gpu_metric(outputDir):
             outputFile.flush()
     except Exception:
         traceback.print_exc()
+    finally:
+        os.umask(old_umask)


 if __name__ == "__main__":
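Rather than pre-creating gpu_metrics and chmod-ing it to 0o777, the patch above clears the umask around each append so the file is created with permissive bits on first write. A minimal sketch of the pattern, under the assumption that 0o666 (open()'s default creation mode with a zero umask) is sufficient for a plain data file:

    import os

    def open_world_writable(path):
        # Hypothetical helper showing the umask pattern used above.
        old_umask = os.umask(0)
        try:
            # open() creates files with mode 0o666 & ~umask, so with a
            # zero umask the file comes out world-readable and -writable.
            return open(path, 'a')
        finally:
            # Always restore the process-wide umask.
            os.umask(old_umask)
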
From d0de4bd9f6d9868df024011101eb476bd23c5583 Mon Sep 17 00:00:00 2001
From: liuzhe <zhe.liu@microsoft.com>
Date: Fri, 9 Aug 2019 10:40:44 +0800
Subject: [PATCH 7/7] Minor fix

---
 .../remote_machine/remoteMachineTrainingService.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
index 26e172f6fe..35631f1ce9 100644
--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -517,7 +517,7 @@ class RemoteMachineTrainingService implements TrainingService {
                     `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
                 if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
                     rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
-                    if (rmMeta.gpuSummary !== undefined && rmMeta.gpuSummary.gpuCount === 0) {
+                    if (rmMeta.gpuSummary.gpuCount === 0) {
                         this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
                         this.timer.unsubscribe(disposable);
                     }