From 466e29a587251703796e27acbb660e4234a69b77 Mon Sep 17 00:00:00 2001
From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com>
Date: Tue, 9 Feb 2021 21:27:10 -0600
Subject: [PATCH] Updated CI scripts to use a different error handling
 convention, updated LD_LIBRARY_PATH for project flash runs (#1386)

* Updated CI scripts to use a different error handling convention
* Updated LD_LIBRARY_PATH for project flash runs
* Added extra logging to report status after each test command
* Added comments
* Removed unused "top-20 slowest" report
* Minor updates for consistency

Tested locally by simulating various error conditions (removing .so files, inserting errors in notebooks, killing processes, etc.) and checking the resulting exit codes.  Still needs to be verified in a Project Flash environment; CI is being used for that.

Authors:
  - Rick Ratzel (@rlratzel)

Approvers:
  - AJ Schmidt (@ajschmidt8)
  - Ray Douglass (@raydouglass)
  - Dillon Cullinan (@dillon-cullinan)

URL: https://github.com/rapidsai/cugraph/pull/1386
---
 ci/getGTestTimes.sh      | 46 -------------------------------------
 ci/gpu/build.sh          | 28 ++++++++++++++---------
 ci/gpu/test-notebooks.sh | 20 ++++++++++++----
 ci/test.sh               | 49 ++++++++++++++++++++++++----------------
 ci/utils/nbtest.sh       | 16 +++++++++++--
 5 files changed, 76 insertions(+), 83 deletions(-)
 delete mode 100755 ci/getGTestTimes.sh

diff --git a/ci/getGTestTimes.sh b/ci/getGTestTimes.sh
deleted file mode 100755
index 8a3752d76e..0000000000
--- a/ci/getGTestTimes.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This script will print the gtest results sorted by runtime. This will print
-# the results two ways: first by printing all tests sorted by runtime, then by
-# printing all tests grouped by test binary with tests sorted by runtime within
-# the group.
-#
-# To use this script, capture the test run output to a file then run this script
-# with the file as the first arg, or just redirect test output to this script.
-
-awk '/^Running GoogleTest .+$/ {
-       testbinary = $3
-     }
-     /^\[       OK \].+$/ {
-        testtime = substr($(NF-1),2)
-        newtestdata = testbinary ":" substr($0,14)
-        alltestdata = alltestdata newtestdata "\n"
-        testdata[testbinary] = testdata[testbinary] newtestdata "\n"
-        totaltime = totaltime + testtime
-     }
-     END {
-        # Print all tests sorted by time
-        system("echo \"" alltestdata "\" | sort -r -t\\( -nk2")
-        print "\n================================================================================"
-        # Print test binaries with tests sorted by time
-        print "Tests grouped by test binary:"
-        for (testbinary in testdata) {
-           print testbinary
-           system("echo \"" testdata[testbinary] "\" | sort -r -t\\( -nk2")
-        }
-        print "\n================================================================================"
-        print totaltime " milliseconds = " totaltime/60000 " minutes"
-     }
-' $1
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 019d03e21d..0fef7b62f8 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env bash
-# Copyright (c) 2018-2020, NVIDIA CORPORATION.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
 ##########################################
 # cuGraph GPU build & testscript for CI  #
 ##########################################
-set -e
-set -o pipefail
+set -e           # abort the script on error, this will change for running tests (see below)
+set -o pipefail  # piped commands propagate their error
 NUMARGS=$#
 ARGS=$*
 
@@ -98,10 +98,15 @@ fi
 # TEST - Run GoogleTest and py.tests for libcugraph and cuGraph
 ################################################################################
 
-set +e -Eo pipefail
-EXITCODE=0
+# Switch to +e so the script continues after a failing command, which is needed
+# so all testing commands run regardless of pass/fail. This requires the desired
+# exit code to be managed using the ERR trap.
+set +e           # allow script to continue on error
+set -E           # ERR traps are inherited by functions, command substitutions and subshells
 trap "EXITCODE=1" ERR
 
+EXITCODE=0
+
 if hasArg --skip-tests; then
     gpuci_logger "Skipping Tests"
 else
@@ -117,18 +122,19 @@ else
         TEST_MODE_FLAG=""
     fi
 
+    gpuci_logger "Running cuGraph test.sh..."
     ${WORKSPACE}/ci/test.sh ${TEST_MODE_FLAG} | tee testoutput.txt
+    gpuci_logger "Ran cuGraph test.sh : return code was: $?, gpu/build.sh exit code is now: $EXITCODE"
 
-    echo -e "\nTOP 20 SLOWEST TESTS:\n"
-    # Wrap in echo to prevent non-zero exit since this command is non-essential
-    echo "$(${WORKSPACE}/ci/getGTestTimes.sh testoutput.txt | head -20)"
-
+    gpuci_logger "Running cuGraph notebook test script..."
     ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log
+    gpuci_logger "Ran cuGraph notebook test script : return code was: $?, gpu/build.sh exit code is now: $EXITCODE"
     python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log
 fi
 
-if [ -n "\${CODECOV_TOKEN}" ]; then
-    codecov -t \$CODECOV_TOKEN
+if [ -n "${CODECOV_TOKEN}" ]; then
+    codecov -t $CODECOV_TOKEN
 fi
 
+gpuci_logger "gpu/build.sh returning value: $EXITCODE"
 return ${EXITCODE}
diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh
index f5f768d7f1..650132f116 100755
--- a/ci/gpu/test-notebooks.sh
+++ b/ci/gpu/test-notebooks.sh
@@ -12,10 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-#RAPIDS_DIR=/rapids
+# Any failing command will set EXITCODE to non-zero
+set -e           # abort the script on error, this will change for running tests (see below)
+set -o pipefail  # piped commands propagate their error
+set -E           # ERR traps are inherited by functions, command substitutions and subshells
+trap "EXITCODE=1" ERR
+
 NOTEBOOKS_DIR=${WORKSPACE}/notebooks
 NBTEST=${WORKSPACE}/ci/utils/nbtest.sh
 LIBCUDF_KERNEL_CACHE_PATH=${WORKSPACE}/.jitcache
+EXITCODE=0
 
 cd ${NOTEBOOKS_DIR}
 TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u)
@@ -23,7 +29,10 @@ TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u)
 ## Check env
 env
 
-EXITCODE=0
+# Do not abort the script on error. This allows all tests to run regardless of
+# pass/fail but relies on the ERR trap above to manage the EXITCODE for the
+# script.
+set +e
 
 # Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure
 # if any run fails
@@ -32,12 +41,14 @@ for folder in ${TOPLEVEL_NB_FOLDERS}; do
     echo "FOLDER: ${folder}"
     echo "========================================"
     cd ${NOTEBOOKS_DIR}/${folder}
-    for nb in $(python ${WORKSPACE}/ci/gpu/notebook_list.py); do
+    NBLIST=$(python ${WORKSPACE}/ci/gpu/notebook_list.py)
+    for nb in ${NBLIST}; do
         nbBasename=$(basename ${nb})
         cd $(dirname ${nb})
         nvidia-smi
         ${NBTEST} ${nbBasename}
-        EXITCODE=$((EXITCODE | $?))
+        echo "Ran nbtest for $nb : return code was: $?, test script exit code is now: $EXITCODE"
+        echo
         rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
         cd ${NOTEBOOKS_DIR}/${folder}
     done
@@ -45,4 +56,5 @@ done
 
 nvidia-smi
 
+echo "Notebook test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/ci/test.sh b/ci/test.sh
index c173088862..db060d3a55 100755
--- a/ci/test.sh
+++ b/ci/test.sh
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# note: do not use set -e in order to allow all gtest invocations to take place,
-# and instead keep track of exit status and exit with an overall exit status
-set -o pipefail
+# Any failing command will set EXITCODE to non-zero
+set -e           # abort the script on error, this will change for running tests (see below)
+set -o pipefail  # piped commands propagate their error
+set -E           # ERR traps are inherited by functions, command substitutions and subshells
+trap "EXITCODE=1" ERR
 
 NUMARGS=$#
 ARGS=$*
@@ -22,7 +24,7 @@ THISDIR=$(cd $(dirname $0);pwd)
 CUGRAPH_ROOT=$(cd ${THISDIR}/..;pwd)
 GTEST_ARGS="--gtest_output=xml:${CUGRAPH_ROOT}/test-results/"
 DOWNLOAD_MODE=""
-ERRORCODE=0
+EXITCODE=0
 
 export RAPIDS_DATASET_ROOT_DIR=${CUGRAPH_ROOT}/datasets
 
@@ -50,27 +52,20 @@ else
     echo "Download datasets..."
     cd ${RAPIDS_DATASET_ROOT_DIR}
     bash ./get_test_data.sh ${DOWNLOAD_MODE}
-    ERRORCODE=$((ERRORCODE | $?))
-    # no need to run tests if dataset download fails
-    if (( ${ERRORCODE} != 0 )); then
-        exit ${ERRORCODE}
-    fi
 fi
 
 if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then
     cd ${CUGRAPH_ROOT}/cpp/build
 else
-    export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$LD_LIBRARY_PATH"
+    export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
     cd $WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build
 fi
 
-for gt in tests/*_TEST; do
-    test_name=$(basename $gt)
-    echo "Running GoogleTest $test_name"
-    ${gt} ${GTEST_FILTER} ${GTEST_ARGS}
-    ERRORCODE=$((ERRORCODE | $?))
-done
-
+# FIXME: if possible, any install and build steps should be moved outside this
+# script since a failing install/build step is treated as a failing test command
+# and will not stop the script. This script is also only expected to run tests
+# in a preconfigured environment, and install/build steps are unexpected side
+# effects.
 if [[ "$PROJECT_FLASH" == "1" ]]; then
     CONDA_FILE=`find $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ -name "libcugraph*.tar.bz2"`
     CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension
@@ -83,14 +78,28 @@ if [[ "$PROJECT_FLASH" == "1" ]]; then
     $WORKSPACE/build.sh cugraph
 fi
 
+# Do not abort the script on error from this point on. This allows all tests to
+# run regardless of pass/fail, but relies on the ERR trap above to manage the
+# EXITCODE for the script.
+set +e
+
+echo "C++ gtests for cuGraph..."
+for gt in tests/*_TEST; do
+    test_name=$(basename $gt)
+    echo "Running gtest $test_name"
+    ${gt} ${GTEST_FILTER} ${GTEST_ARGS}
+    echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE"
+done
+
 echo "Python pytest for cuGraph..."
 cd ${CUGRAPH_ROOT}/python
 pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term --ignore=cugraph/raft --benchmark-disable
-ERRORCODE=$((ERRORCODE | $?))
+echo "Ran Python pytest for cugraph : return code was: $?, test script exit code is now: $EXITCODE"
 
 echo "Python benchmarks for cuGraph (running as tests)..."
 cd ${CUGRAPH_ROOT}/benchmarks
 pytest -v -m "managedmem_on and poolallocator_on and tiny" --benchmark-disable
-ERRORCODE=$((ERRORCODE | $?))
+echo "Ran Python benchmarks for cuGraph (running as tests) : return code was: $?, test script exit code is now: $EXITCODE"
 
-exit ${ERRORCODE}
+echo "Test script exiting with value: $EXITCODE"
+exit ${EXITCODE}
diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh
index 8c86baeaa0..ae8b52df10 100755
--- a/ci/utils/nbtest.sh
+++ b/ci/utils/nbtest.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,6 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Any failing command will set EXITCODE to non-zero
+set +e           # do not abort the script on error
+set -o pipefail  # piped commands propagate their error
+set -E           # ERR traps are inherited by functions, command substitutions and subshells
+trap "EXITCODE=1" ERR
+
+# Prepend the following code to all scripts generated from nbconvert.  This
+# allows all cell and line magic code to run and update the namespace as if
+# running in jupyter, but will also tolerate failures due to running in a
+# non-jupyter env.
+# Note: depending on the assumptions of the notebook script, ignoring failures
+# may not be acceptable (meaning the converted notebook simply cannot run
+# outside of jupyter as-is), hence the warning.
 MAGIC_OVERRIDE_CODE="
 def my_run_line_magic(*args, **kwargs):
     g=globals()
@@ -58,7 +71,6 @@ for nb in $*; do
     NBEXITCODE=$?
     echo EXIT CODE: ${NBEXITCODE}
     echo
-    EXITCODE=$((EXITCODE | ${NBEXITCODE}))
 done
 
 exit ${EXITCODE}