From 57658190a1c9603559b4f241b9524c1ccc808575 Mon Sep 17 00:00:00 2001
From: Karel Vesely
Date: Wed, 5 Apr 2017 19:26:41 +0200
Subject: [PATCH] [src] nnet1: improving the GPU diagnostics, (#1532)

- we auto-detect the 'compute capability' problems (these appear as the 'invalid device function'),
- we also provide guidelines what to try before posting to forum, and which info to send to us,
---
 src/nnetbin/cuda-gpu-available.cc | 74 ++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 26 deletions(-)

diff --git a/src/nnetbin/cuda-gpu-available.cc b/src/nnetbin/cuda-gpu-available.cc
index 897f01a8241..89fd26be86f 100644
--- a/src/nnetbin/cuda-gpu-available.cc
+++ b/src/nnetbin/cuda-gpu-available.cc
@@ -24,9 +24,21 @@
 
 #include "base/kaldi-common.h"
 #include "cudamatrix/cu-device.h"
+#include "cudamatrix/cu-matrix.h"
 
 using namespace kaldi;
 
+#if HAVE_CUDA == 1
+/**
+ * With incorrect CUDA setup, this will trigger "invalid device function" error.
+ */
+void TestGpuComputation() {
+  CuMatrix<BaseFloat> m(100,100);
+  m.SetRandn();
+  m.ApplySoftMaxPerRow(m);
+}
+#endif
+
 int main(int argc, char *argv[]) try {
   char hostname[100] = "UNKNOWN-HOSTNAME";
 #ifndef _MSC_VER
@@ -34,14 +46,33 @@ int main(int argc, char *argv[]) try {
     KALDI_WARN << "Cannot get hostname, " << strerror(errno);
   }
 #endif
-  std::cerr
-    << "### IS CUDA GPU AVAILABLE? '"
-    << hostname << "' ###" << std::endl;
+  KALDI_LOG << std::endl << std::endl
+    << "### IS CUDA GPU AVAILABLE? '" << hostname << "' ###";
 #if HAVE_CUDA == 1
   CuDevice::Instantiate().SelectGpuId("yes");
-  std::cerr
-    << "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ###"
-    << std::endl;
+  fprintf(stderr, "### HURRAY, WE GOT A CUDA GPU FOR COMPUTATION!!! ##\n\n");
+  fprintf(stderr, "### Testing CUDA setup with a small computation "
+          "(setup = cuda-toolkit + gpu-driver + kaldi):\n");
+  // the test of setup by computation,
+  try {
+    TestGpuComputation();
+  } catch (const std::exception &e) {
+    fprintf(stderr, "%s\n", e.what());
+    KALDI_LOG << "...\n"
+      << "### The CUDA setup is wrong! "
+      << "(\"invalid device function\" == problem with 'compute capability' "
+      << "in compiled kaldi)\n"
+      << "### Before posting the error to forum, please try following:\n"
+      << "### 1) update kaldi & cuda-toolkit (& GPU driver),\n"
+      << "### 2) re-run 'src/configure',\n"
+      << "### 3) re-compile kaldi by 'make clean; make -j depend; make -j'\n"
+      << "###\n"
+      << "### If the problem persists, please send us your:\n"
+      << "### - GPU model name, cuda-toolkit version, driver version "
+      << "(run nvidia-smi), variable $(CUDA_ARCH) from src/kaldi.mk";
+    return -1;
+  }
+  fprintf(stderr, "### Test OK!\n");
   return 0;
 #else
   std::cerr
@@ -51,26 +82,17 @@ int main(int argc, char *argv[]) try {
   return 1;
 #endif
 } catch (const std::exception &e) {
-  std::cerr << e.what();
-  std::cerr
-    << "### WE DID NOT GET A CUDA GPU!!! ###" << std::endl
-    << "### If it's your 1st experiment with CUDA, try reinstalling "
-    << "'CUDA toolkit' from NVidia web (it contains the drivers)."
-    << std::endl
-    << "### In other cases run 'nvidia-smi' in terminal "
-    << "(gets installed with display drivers) :"
-    << std::endl
-    << "### - Check that you see your GPU."
-    << std::endl
-    << "### - Bad GPUs are reporting error or disappear from the list "
-    << "until reboot."
-    << std::endl
-    << "### - Check 'Memory-Usage' and 'GPU fan', "
-    << "which will tell you if the GPU was taken by other process."
-    << std::endl
-    << "### - Check there is same version of 'NVIDIA-SMI' and "
-    << "'Driver', and that it is not too old for your GPU."
-    << std::endl;
+  fprintf(stderr, "%s\n", e.what());
+  KALDI_LOG << "...\n"
+    << "### WE DID NOT GET A CUDA GPU!!! ###\n"
+    << "### If your system has a 'free' CUDA GPU, try re-installing "
+    << "latest 'CUDA toolkit' from NVidia (this updates GPU drivers too).\n"
+    << "### Otherwise 'nvidia-smi' shows the status of GPUs:\n"
+    << "### - The versions should match ('NVIDIA-SMI' and 'Driver Version'), "
+    << "otherwise reboot or reload kernel module,\n"
+    << "### - The GPU should be unused "
+    << "(no 'process' in list, low 'memory-usage' (<100MB), low 'gpu-fan' (<30%)),\n"
+    << "### - You should see your GPU (burnt GPUs may disappear from the list until reboot),";
   return -1;
 }
 