From 466e29a587251703796e27acbb660e4234a69b77 Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Tue, 9 Feb 2021 21:27:10 -0600 Subject: [PATCH] Updated CI scripts to use a different error handling convention, updated LD_LIBRARY_PATH for project flash runs (#1386) * Updated CI scripts to use a different error handling convention * Updated LD_LIBRARY_PATH for project flash runs * Added extra logging to report status after each test command * Added comments * Removed unused "top-20 slowest" report * Minor updates for consistency Tested locally by simulating various error conditions (removed .so files, inserted errors in NBs, killed processes, etc.) and checked exit codes. Still need to verify in a project Flash env, but using CI for that. Authors: - Rick Ratzel (@rlratzel) Approvers: - AJ Schmidt (@ajschmidt8) - Ray Douglass (@raydouglass) - Dillon Cullinan (@dillon-cullinan) URL: https://github.com/rapidsai/cugraph/pull/1386 --- ci/getGTestTimes.sh | 46 ------------------------------------- ci/gpu/build.sh | 28 ++++++++++++++--------- ci/gpu/test-notebooks.sh | 20 ++++++++++++---- ci/test.sh | 49 ++++++++++++++++++++++++---------------- ci/utils/nbtest.sh | 16 +++++++++++-- 5 files changed, 76 insertions(+), 83 deletions(-) delete mode 100755 ci/getGTestTimes.sh diff --git a/ci/getGTestTimes.sh b/ci/getGTestTimes.sh deleted file mode 100755 index 8a3752d76e..0000000000 --- a/ci/getGTestTimes.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script will print the gtest results sorted by runtime. This will print -# the results two ways: first by printing all tests sorted by runtime, then by -# printing all tests grouped by test binary with tests sorted by runtime within -# the group. -# -# To use this script, capture the test run output to a file then run this script -# with the file as the first arg, or just redirect test output to this script. - -awk '/^Running GoogleTest .+$/ { - testbinary = $3 - } - /^\[ OK \].+$/ { - testtime = substr($(NF-1),2) - newtestdata = testbinary ":" substr($0,14) - alltestdata = alltestdata newtestdata "\n" - testdata[testbinary] = testdata[testbinary] newtestdata "\n" - totaltime = totaltime + testtime - } - END { - # Print all tests sorted by time - system("echo \"" alltestdata "\" | sort -r -t\\( -nk2") - print "\n================================================================================" - # Print test binaries with tests sorted by time - print "Tests grouped by test binary:" - for (testbinary in testdata) { - print testbinary - system("echo \"" testdata[testbinary] "\" | sort -r -t\\( -nk2") - } - print "\n================================================================================" - print totaltime " milliseconds = " totaltime/60000 " minutes" - } -' $1 diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 019d03e21d..0fef7b62f8 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. ########################################## # cuGraph GPU build & testscript for CI # ########################################## -set -e -set -o pipefail +set -e # abort the script on error, this will change for running tests (see below) +set -o pipefail # piped commands propagate their error NUMARGS=$# ARGS=$* @@ -98,10 +98,15 @@ fi # TEST - Run GoogleTest and py.tests for libcugraph and cuGraph ################################################################################ -set +e -Eo pipefail -EXITCODE=0 +# Switch to +e to allow failing commands to continue the script, which is needed +# so all testing commands run regardless of pass/fail. This requires the desired +# exit code to be managed using the ERR trap. +set +e # allow script to continue on error +set -E # ERR traps are inherited by subcommands trap "EXITCODE=1" ERR +EXITCODE=0 + if hasArg --skip-tests; then gpuci_logger "Skipping Tests" else @@ -117,18 +122,19 @@ else TEST_MODE_FLAG="" fi + gpuci_logger "Running cuGraph test.sh..." ${WORKSPACE}/ci/test.sh ${TEST_MODE_FLAG} | tee testoutput.txt + gpuci_logger "Ran cuGraph test.sh : return code was: $?, gpu/build.sh exit code is now: $EXITCODE" - echo -e "\nTOP 20 SLOWEST TESTS:\n" - # Wrap in echo to prevent non-zero exit since this command is non-essential - echo "$(${WORKSPACE}/ci/getGTestTimes.sh testoutput.txt | head -20)" - + gpuci_logger "Running cuGraph notebook test script..." ${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log + gpuci_logger "Ran cuGraph notebook test script : return code was: $?, gpu/build.sh exit code is now: $EXITCODE" python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py nbtest.log fi -if [ -n "\${CODECOV_TOKEN}" ]; then - codecov -t \$CODECOV_TOKEN +if [ -n "${CODECOV_TOKEN}" ]; then + codecov -t $CODECOV_TOKEN fi +gpuci_logger "gpu/build.sh returning value: $EXITCODE" return ${EXITCODE} diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh index f5f768d7f1..650132f116 100755 --- a/ci/gpu/test-notebooks.sh +++ b/ci/gpu/test-notebooks.sh @@ -12,10 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -#RAPIDS_DIR=/rapids +# Any failing command will set EXITCODE to non-zero +set -e # abort the script on error, this will change for running tests (see below) +set -o pipefail # piped commands propagate their error +set -E # ERR traps are inherited by subcommands +trap "EXITCODE=1" ERR + NOTEBOOKS_DIR=${WORKSPACE}/notebooks NBTEST=${WORKSPACE}/ci/utils/nbtest.sh LIBCUDF_KERNEL_CACHE_PATH=${WORKSPACE}/.jitcache +EXITCODE=0 cd ${NOTEBOOKS_DIR} TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u) @@ -23,7 +29,10 @@ TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u) ## Check env env -EXITCODE=0 +# Do not abort the script on error. This allows all tests to run regardless of +# pass/fail but relies on the ERR trap above to manage the EXITCODE for the +# script. +set +e # Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure # if any run fails @@ -32,12 +41,14 @@ for folder in ${TOPLEVEL_NB_FOLDERS}; do echo "FOLDER: ${folder}" echo "========================================" cd ${NOTEBOOKS_DIR}/${folder} - for nb in $(python ${WORKSPACE}/ci/gpu/notebook_list.py); do + NBLIST=$(python ${WORKSPACE}/ci/gpu/notebook_list.py) + for nb in ${NBLIST}; do nbBasename=$(basename ${nb}) cd $(dirname ${nb}) nvidia-smi ${NBTEST} ${nbBasename} - EXITCODE=$((EXITCODE | $?)) + echo "Ran nbtest for $nb : return code was: $?, test script exit code is now: $EXITCODE" + echo rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/* cd ${NOTEBOOKS_DIR}/${folder} done @@ -45,4 +56,5 @@ done nvidia-smi +echo "Notebook test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test.sh b/ci/test.sh index c173088862..db060d3a55 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -# note: do not use set -e in order to allow all gtest invocations to take place, -# and instead keep track of exit status and exit with an overall exit status -set -o pipefail +# Any failing command will set EXITCODE to non-zero +set -e # abort the script on error, this will change for running tests (see below) +set -o pipefail # piped commands propagate their error +set -E # ERR traps are inherited by subcommands +trap "EXITCODE=1" ERR NUMARGS=$# ARGS=$* @@ -22,7 +24,7 @@ THISDIR=$(cd $(dirname $0);pwd) CUGRAPH_ROOT=$(cd ${THISDIR}/..;pwd) GTEST_ARGS="--gtest_output=xml:${CUGRAPH_ROOT}/test-results/" DOWNLOAD_MODE="" -ERRORCODE=0 +EXITCODE=0 export RAPIDS_DATASET_ROOT_DIR=${CUGRAPH_ROOT}/datasets @@ -50,27 +52,20 @@ else echo "Download datasets..." cd ${RAPIDS_DATASET_ROOT_DIR} bash ./get_test_data.sh ${DOWNLOAD_MODE} - ERRORCODE=$((ERRORCODE | $?)) - # no need to run tests if dataset download fails - if (( ${ERRORCODE} != 0 )); then - exit ${ERRORCODE} - fi fi if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then cd ${CUGRAPH_ROOT}/cpp/build else - export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$LD_LIBRARY_PATH" + export LD_LIBRARY_PATH="$WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build:$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" cd $WORKSPACE/ci/artifacts/cugraph/cpu/conda_work/cpp/build fi -for gt in tests/*_TEST; do - test_name=$(basename $gt) - echo "Running GoogleTest $test_name" - ${gt} ${GTEST_FILTER} ${GTEST_ARGS} - ERRORCODE=$((ERRORCODE | $?)) -done - +# FIXME: if possible, any install and build steps should be moved outside this +# script since a failing install/build step is treated as a failing test command +# and will not stop the script. This script is also only expected to run tests +# in a preconfigured environment, and install/build steps are unexpected side +# effects. if [[ "$PROJECT_FLASH" == "1" ]]; then CONDA_FILE=`find $WORKSPACE/ci/artifacts/cugraph/cpu/conda-bld/ -name "libcugraph*.tar.bz2"` CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension @@ -83,14 +78,28 @@ if [[ "$PROJECT_FLASH" == "1" ]]; then $WORKSPACE/build.sh cugraph fi +# Do not abort the script on error from this point on. This allows all tests to +# run regardless of pass/fail, but relies on the ERR trap above to manage the +# EXITCODE for the script. +set +e + +echo "C++ gtests for cuGraph..." +for gt in tests/*_TEST; do + test_name=$(basename $gt) + echo "Running gtest $test_name" + ${gt} ${GTEST_FILTER} ${GTEST_ARGS} + echo "Ran gtest $test_name : return code was: $?, test script exit code is now: $EXITCODE" +done + echo "Python pytest for cuGraph..." cd ${CUGRAPH_ROOT}/python pytest --cache-clear --junitxml=${CUGRAPH_ROOT}/junit-cugraph.xml -v --cov-config=.coveragerc --cov=cugraph --cov-report=xml:${WORKSPACE}/python/cugraph/cugraph-coverage.xml --cov-report term --ignore=cugraph/raft --benchmark-disable -ERRORCODE=$((ERRORCODE | $?)) +echo "Ran Python pytest for cugraph : return code was: $?, test script exit code is now: $EXITCODE" echo "Python benchmarks for cuGraph (running as tests)..." cd ${CUGRAPH_ROOT}/benchmarks pytest -v -m "managedmem_on and poolallocator_on and tiny" --benchmark-disable -ERRORCODE=$((ERRORCODE | $?)) +echo "Ran Python benchmarks for cuGraph (running as tests) : return code was: $?, test script exit code is now: $EXITCODE" -exit ${ERRORCODE} +echo "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh index 8c86baeaa0..ae8b52df10 100755 --- a/ci/utils/nbtest.sh +++ b/ci/utils/nbtest.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,6 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Any failing command will set EXITCODE to non-zero +set +e # do not abort the script on error +set -o pipefail # piped commands propagate their error +set -E # ERR traps are inherited by subcommands +trap "EXITCODE=1" ERR + +# Prepend the following code to all scripts generated from nbconvert. This +# allows all cell and line magic code to run and update the namespace as if +# running in jupyter, but will also tolerate failures due to running in a +# non-jupyter env. +# Note: depending on the assumptions of the notebook script, ignoring failures +# may not be acceptable (meaning the converted notebook simply cannot run +# outside of jupyter as-is), hence the warning. MAGIC_OVERRIDE_CODE=" def my_run_line_magic(*args, **kwargs): g=globals() @@ -58,7 +71,6 @@ for nb in $*; do NBEXITCODE=$? echo EXIT CODE: ${NBEXITCODE} echo - EXITCODE=$((EXITCODE | ${NBEXITCODE})) done exit ${EXITCODE}