diff --git a/src/nni_manager/training_service/local/gpuScheduler.ts b/src/nni_manager/training_service/local/gpuScheduler.ts
index 933235a222..017a2af38c 100644
--- a/src/nni_manager/training_service/local/gpuScheduler.ts
+++ b/src/nni_manager/training_service/local/gpuScheduler.ts
@@ -54,6 +54,9 @@ class GPUScheduler {
             } catch (error) {
                 this.log.error('Read GPU summary failed with error: ', error);
             }
+            if (this.gpuSummary !== undefined && this.gpuSummary.gpuCount === 0) {
+                throw new Error('GPU not available. Please check your CUDA configuration');
+            }
             await delay(5000);
         }
     }
diff --git a/src/nni_manager/training_service/local/localTrainingService.ts b/src/nni_manager/training_service/local/localTrainingService.ts
index 1fb2cb9327..c47be9ec32 100644
--- a/src/nni_manager/training_service/local/localTrainingService.ts
+++ b/src/nni_manager/training_service/local/localTrainingService.ts
@@ -131,7 +131,7 @@ class LocalTrainingService implements TrainingService {
     private readonly occupiedGpuIndexNumMap: Map<number, number>;
     private designatedGpuIndices!: Set<number>;
     private readonly log: Logger;
-    private localTrailConfig?: TrialConfig;
+    private localTrialConfig?: TrialConfig;
     private localConfig?: LocalConfig;
     private isMultiPhase: boolean;
     private readonly jobStreamMap: Map<string, ts.Stream>;
@@ -204,7 +204,7 @@ class LocalTrainingService implements TrainingService {
             } catch (error) {
                 //ignore
             }
-            this.log.debug(`trailJob status update: ${trialJobId}, ${trialJob.status}`);
+            this.log.debug(`trialJob status update: ${trialJobId}, ${trialJob.status}`);
         }
     }
 
@@ -302,14 +302,14 @@ class LocalTrainingService implements TrainingService {
         }
         switch (key) {
             case TrialConfigMetadataKey.TRIAL_CONFIG:
-                this.localTrailConfig = <TrialConfig>JSON.parse(value);
+                this.localTrialConfig = <TrialConfig>JSON.parse(value);
                 // Parse trial config failed, throw Error
-                if (this.localTrailConfig === undefined) {
+                if (this.localTrialConfig === undefined) {
                     throw new Error('trial config parsed failed');
                 }
-                if (this.localTrailConfig.gpuNum !== undefined) {
-                    this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
-                    if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
+                if (this.localTrialConfig.gpuNum !== undefined) {
+                    this.log.info(`required GPU number is ${this.localTrialConfig.gpuNum}`);
+                    if (this.gpuScheduler === undefined && this.localTrialConfig.gpuNum > 0) {
                         this.gpuScheduler = new GPUScheduler();
                     }
                 }
@@ -343,10 +343,10 @@ class LocalTrainingService implements TrainingService {
         switch (key) {
             case TrialConfigMetadataKey.TRIAL_CONFIG:
                 let getResult: Promise<string>;
-                if (this.localTrailConfig === undefined) {
+                if (this.localTrialConfig === undefined) {
                     getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
                 } else {
-                    getResult = Promise.resolve(JSON.stringify(this.localTrailConfig));
+                    getResult = Promise.resolve(JSON.stringify(this.localTrialConfig));
                 }
 
                 return getResult;
@@ -427,8 +427,8 @@ class LocalTrainingService implements TrainingService {
     }
 
     private tryGetAvailableResource(): [boolean, { gpuIndices: number[]}] {
-        if (this.localTrailConfig === undefined) {
-            throw new Error('localTrailConfig is not initialized!');
+        if (this.localTrialConfig === undefined) {
+            throw new Error('localTrialConfig is not initialized!');
         }
 
         const resource: { gpuIndices: number[] } = { gpuIndices: [] };
@@ -450,11 +450,11 @@ class LocalTrainingService implements TrainingService {
             selectedGPUIndices = selectedGPUIndices.filter((index: number) => this.designatedGpuIndices.has(index));
         }
 
-        if (selectedGPUIndices.length < this.localTrailConfig.gpuNum) {
+        if (selectedGPUIndices.length < this.localTrialConfig.gpuNum) {
             return [false, resource];
         }
 
-        selectedGPUIndices.splice(this.localTrailConfig.gpuNum);
+        selectedGPUIndices.splice(this.localTrialConfig.gpuNum);
         Object.assign(resource, { gpuIndices: selectedGPUIndices });
 
         return [true, resource];
@@ -512,17 +512,17 @@ class LocalTrainingService implements TrainingService {
         }
     }
 
-    private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
+    private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
         const script: string[] = [];
         if (process.platform === 'win32') {
             script.push(
-                `cmd /c ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
+                `cmd /c ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
                 `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
                 `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
                 `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${path.join(workingDirectory, '.nni', 'state')} -NoNewline -encoding utf8`);
         } else {
             script.push(
-                `eval ${localTrailConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
+                `eval ${localTrialConfig.command} 2>${path.join(workingDirectory, 'stderr')}`,
                 `echo $? \`date +%s%3N\` >${path.join(workingDirectory, '.nni', 'state')}`);
         }
 
@@ -531,23 +531,23 @@ class LocalTrainingService implements TrainingService {
 
     private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
         const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
-        if (this.localTrailConfig === undefined) {
+        if (this.localTrialConfig === undefined) {
             throw new Error(`localTrialConfig not initialized!`);
         }
-        const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum);
+        const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrialConfig.gpuNum);
 
-        if (this.localTrailConfig === undefined) {
+        if (this.localTrialConfig === undefined) {
             throw new Error('trial config is not initialized');
         }
         const runScriptContent: string[] = [];
         if (process.platform !== 'win32') {
             runScriptContent.push('#!/bin/bash');
         }
-        runScriptContent.push(`cd ${this.localTrailConfig.codeDir}`);
+        runScriptContent.push(`cd ${this.localTrialConfig.codeDir}`);
         for (const variable of variables) {
             runScriptContent.push(setEnvironmentVariable(variable));
         }
-        const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
+        const scripts: string[] = this.getScript(this.localTrialConfig, trialJobDetail.workingDirectory);
         scripts.forEach((script: string) => {
             runScriptContent.push(script);
         });
diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
index c55c28427b..35631f1ce9 100644
--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -511,12 +511,16 @@ class RemoteMachineTrainingService implements TrainingService {
         // tslint:disable-next-line: no-floating-promises
         SSHClientUtility.remoteExeCommand(`bash ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics_collector.sh')}`, conn);
 
-        this.timer.subscribe(
+        const disposable: Rx.IDisposable = this.timer.subscribe(
             async (tick: number) => {
                 const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
                     `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
                 if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
                     rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
+                    if (rmMeta.gpuSummary.gpuCount === 0) {
+                        this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
+                        this.timer.unsubscribe(disposable);
+                    }
                 }
             }
         );
diff --git a/tools/nni_gpu_tool/gpu_metrics_collector.py b/tools/nni_gpu_tool/gpu_metrics_collector.py
index 56095a0362..f58b9b2895 100644
--- a/tools/nni_gpu_tool/gpu_metrics_collector.py
+++ b/tools/nni_gpu_tool/gpu_metrics_collector.py
@@ -21,6 +21,7 @@
 import subprocess
 import sys
 import time
+import traceback
 
 from xml.dom import minidom
 
@@ -33,7 +34,7 @@ def check_ready_to_run():
         pidList.remove(os.getpid())
         return len(pidList) == 0
     else:
-        pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
+        pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
         pidList = []
         for pid in pgrep_output.splitlines():
             pidList.append(int(pid))
@@ -45,23 +46,21 @@ def main(argv):
     if check_ready_to_run() == False:
         # GPU metrics collector is already running. Exit
        exit(2)
-    with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile:
-        pass
-    os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
-    cmd = 'nvidia-smi -q -x'
+    cmd = 'nvidia-smi -q -x'.split()
     while(True):
         try:
-            smi_output = subprocess.check_output(cmd, shell=True)
-            parse_nvidia_smi_result(smi_output, metrics_output_dir)
-        except:
-            exception = sys.exc_info()
-            for e in exception:
-                print("job exporter error {}".format(e))
+            smi_output = subprocess.check_output(cmd)
+        except Exception:
+            traceback.print_exc()
+            gen_empty_gpu_metric(metrics_output_dir)
+            break
+        parse_nvidia_smi_result(smi_output, metrics_output_dir)
         # TODO: change to sleep time configurable via arguments
         time.sleep(5)
 
 def parse_nvidia_smi_result(smi, outputDir):
     try:
+        old_umask = os.umask(0)
         xmldoc = minidom.parseString(smi)
         gpuList = xmldoc.getElementsByTagName('gpu')
         with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
     except :
         e_info = sys.exc_info()
         print('xmldoc paring error')
+    finally:
+        os.umask(old_umask)
+
+def gen_empty_gpu_metric(outputDir):
+    try:
+        old_umask = os.umask(0)
+        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
+            outPut = {}
+            outPut["Timestamp"] = time.asctime(time.localtime())
+            outPut["gpuCount"] = 0
+            outPut["gpuInfos"] = []
+            print(outPut)
+            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
+            outputFile.flush()
+    except Exception:
+        traceback.print_exc()
+    finally:
+        os.umask(old_umask)
 
 
 if __name__ == "__main__":
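
Note (illustration only, not part of the patch): after this change the gpu_metrics file always ends with a JSON record per collection round, and gen_empty_gpu_metric() writes a record with "gpuCount": 0 when nvidia-smi is unavailable. The short Python sketch below mirrors how the remote training service consumes that file (`tail -n 1` followed by JSON parsing); the helper name read_latest_gpu_summary and the example directory path are assumptions made for illustration, not identifiers defined by NNI.

# Illustration only (not from this diff): read the newest gpu_metrics record
# and detect the gpuCount == 0 entry written by gen_empty_gpu_metric().
# read_latest_gpu_summary and the example directory below are assumed names.
import json
import os

def read_latest_gpu_summary(metrics_output_dir):
    metrics_path = os.path.join(metrics_output_dir, "gpu_metrics")
    if not os.path.isfile(metrics_path):
        return None
    with open(metrics_path) as metrics_file:
        lines = metrics_file.read().splitlines()
    # Each line is one JSON summary; the last line is the most recent one.
    return json.loads(lines[-1]) if lines else None

summary = read_latest_gpu_summary("/tmp/nni_gpu_metrics")  # assumed path
if summary is not None and summary["gpuCount"] == 0:
    print("No GPU detected; GPU scheduling cannot be used on this machine.")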