Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Report the error more gracefully when nvidia-smi does not exist #1418

Merged
merged 9 commits into from
Aug 9, 2019
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions tools/nni_gpu_tool/gpu_metrics_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import subprocess
import sys
import time
import traceback

from xml.dom import minidom

Expand All @@ -33,7 +34,7 @@ def check_ready_to_run():
pidList.remove(os.getpid())
return len(pidList) == 0
else:
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pidList.append(int(pid))
Expand All @@ -48,11 +49,13 @@ def main(argv):
with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile:
pass
os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
cmd = 'nvidia-smi -q -x'
cmd = 'nvidia-smi -q -x'.split()
while(True):
try:
smi_output = subprocess.check_output(cmd, shell=True)
smi_output = subprocess.check_output(cmd)
parse_nvidia_smi_result(smi_output, metrics_output_dir)
liuzhe-lz marked this conversation as resolved.
Show resolved Hide resolved
except FileNotFoundError:
gen_empty_gpu_metric(metrics_output_dir)
except:
exception = sys.exc_info()
for e in exception:
Expand Down Expand Up @@ -86,6 +89,19 @@ def parse_nvidia_smi_result(smi, outputDir):
e_info = sys.exc_info()
print('xmldoc paring error')

def gen_empty_gpu_metric(outputDir):
    """Append a "no GPUs" record to the ``gpu_metrics`` file in *outputDir*.

    The record is a single JSON line (keys sorted) holding the current local
    time under ``Timestamp``, a ``gpuCount`` of 0 and an empty ``gpuInfos``
    list. The record is also echoed to stdout. Any exception raised while
    building or writing the record is caught and its traceback printed, so
    the caller is never interrupted.
    """
    try:
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as metrics_file:
            # Keys are listed in the same order the record is printed.
            record = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(record)
            serialized = json.dumps(record, sort_keys=True)
            metrics_file.write("{}\n".format(serialized))
            # Flush right after the write so the line reaches the file
            # before the block exits.
            metrics_file.flush()
    except Exception:
        traceback.print_exc()


if __name__ == "__main__":
    # Script entry point: pass the command-line arguments, minus the
    # program name, to main().
    cli_args = sys.argv[1:]
    main(cli_args)