From b72f8f1c326dfa640102b5039fe32d7d3c836c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 30 Jan 2024 14:22:10 +0100 Subject: [PATCH 1/5] tests.e2e: Add timeouts to individual steps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit we are seeing stalled jobs, let's ensure our steps won't take longer than expected. Signed-off-by: Lukáš Doktor --- .github/workflows/ccruntime_e2e.yaml | 2 +- tests/e2e/run-local.sh | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ccruntime_e2e.yaml b/.github/workflows/ccruntime_e2e.yaml index 788eb2f0..674480ee 100644 --- a/.github/workflows/ccruntime_e2e.yaml +++ b/.github/workflows/ccruntime_e2e.yaml @@ -62,7 +62,7 @@ jobs: if [ $RUNNING_INSTANCE = "s390x" ]; then args="" fi - ./run-local.sh -r "${{ matrix.runtimeclass }}" "${args}" + ./run-local.sh -t -r "${{ matrix.runtimeclass }}" "${args}" env: RUNNING_INSTANCE: ${{ matrix.instance }} diff --git a/tests/e2e/run-local.sh b/tests/e2e/run-local.sh index bfbb780e..ae3dd826 100755 --- a/tests/e2e/run-local.sh +++ b/tests/e2e/run-local.sh @@ -15,6 +15,7 @@ step_start_cluster=0 step_install_operator=0 runtimeclass="" undo="false" +timeout="false" usage() { cat <<-EOF @@ -29,36 +30,47 @@ usage() { the tests. Defaults to "kata-qemu". -u: undo the installation and configuration before exiting. Useful for baremetal machine were it needs to do clean up for the next tests. 
+ -t: enable default timeout for each operation (useful for CI) EOF } parse_args() { - while getopts "hr:u" opt; do + while getopts "hr:ut" opt; do case $opt in h) usage && exit 0;; r) runtimeclass="$OPTARG";; u) undo="true";; + t) timeout="true";; *) usage && exit 1;; esac done } +run() { + duration=$1; shift + if [ "$timeout" == "true" ]; then + timeout $duration "$@" + else + "$@" + fi +} + undo_changes() { pushd "$script_dir" >/dev/null # Do not try to undo steps that did not execute. if [ $step_install_operator -eq 1 ]; then echo "INFO: Uninstall the operator" - sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true + run 10m sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true fi if [ $step_start_cluster -eq 1 ]; then echo "INFO: Shutdown the cluster" - sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true + run 5m sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true fi if [ $step_bootstrap_env -eq 1 ]; then echo "INFO: Undo the bootstrap" - ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true + run 5m ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true fi popd >/dev/null } @@ -87,19 +99,19 @@ main() { pushd "$script_dir" >/dev/null echo "INFO: Bootstrap the local machine" step_bootstrap_env=1 - ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml + run 10m ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml echo "INFO: Bring up the test cluster" step_start_cluster=1 - sudo -E PATH="$PATH" bash -c './cluster/up.sh' + run 10m sudo -E PATH="$PATH" bash -c './cluster/up.sh' export KUBECONFIG=/etc/kubernetes/admin.conf echo "INFO: Build and install the operator" step_install_operator=1 - sudo -E PATH="$PATH" bash -c './operator.sh' + run 20m sudo -E PATH="$PATH" bash -c './operator.sh' echo "INFO: Run tests" - cmd="sudo -E PATH=\"$PATH\" bash -c " + cmd="run 20m sudo -E PATH=\"$PATH\" bash -c " if [ -z "$runtimeclass" ]; then 
cmd+="'./tests_runner.sh'" else From db80d13cacf1a74b5ff8df38da11a7f402ed9fb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 30 Jan 2024 14:23:32 +0100 Subject: [PATCH 2/5] tests.e2e: Add 10m timeout for each bats test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ensure each test won't take longer than the expected threshold by setting a 10m timeout. Note one can only set one timeout for all tests within a single file. Signed-off-by: Lukáš Doktor --- tests/e2e/operator_tests.bats | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/e2e/operator_tests.bats b/tests/e2e/operator_tests.bats index bea771c0..0cc7449c 100644 --- a/tests/e2e/operator_tests.bats +++ b/tests/e2e/operator_tests.bats @@ -8,6 +8,9 @@ load "${BATS_TEST_DIRNAME}/lib.sh" test_tag="[cc][operator]" +# Set 10m timeout for each test +export BATS_TEST_TIMEOUT=600 + is_operator_installed() { [ "$(kubectl get deployment -n "$ns" --no-headers 2>/dev/null | wc -l)" \ -gt 0 ] From 62d67be442141d7d0bf25b4bc71cced9011546cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 30 Jan 2024 16:11:36 +0100 Subject: [PATCH 3/5] tests.e2e: Ensure we don't skip on kubectl failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit recent issues in CI indicate that kubectl might sometimes fail which results in wait_for_process interrupting the loop. Let's improve the command to ensure kubectl command passed and only then grep for the (un)expected output. Note the positive commands do not need to be treated as the output should not contain the pod names on failure. 
Fixes: #339 Signed-off-by: Lukáš Doktor --- tests/e2e/operator.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/e2e/operator.sh b/tests/e2e/operator.sh index 3503ecd7..8cc9ab3c 100755 --- a/tests/e2e/operator.sh +++ b/tests/e2e/operator.sh @@ -164,9 +164,9 @@ uninstall_ccruntime() { popd >/dev/null # Wait and ensure ccruntime pods are gone - # - local cmd="! sudo -E kubectl get pods -n $op_ns |" - cmd+="grep -q -e cc-operator-daemon-install" + # (ensure failing kubectl keeps iterating) + local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')" + cmd+=" && ! echo \$_OUT | grep -q -e cc-operator-daemon-install" cmd+=" -e cc-operator-pre-install-daemon" if ! wait_for_process 720 30 "$cmd"; then echo "ERROR: there are ccruntime pods still running" @@ -242,10 +242,9 @@ uninstall_operator() { popd >/dev/null # Wait and ensure the controller pod is gone - # - local pod="cc-operator-controller-manager" - local cmd="! kubectl get pods -n $op_ns |" - cmd+="grep -q $pod" + # (ensure failing kubectl keeps iterating) + local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')" + cmd+="&& ! echo \$_OUT | grep -q -e cc-operator-controller-manager" if ! wait_for_process 180 30 "$cmd"; then echo "ERROR: the controller manager is still running" From 5d4895d7a173afea73f658bbc7b0dafeda146236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 30 Jan 2024 16:14:23 +0100 Subject: [PATCH 4/5] tests.e2e: Log an error message after unsuccessful run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On test failure we might still execute a cleanup that spills a bunch of text making it not obvious whether the testing passed or failed. Note the return code is already fine, this change is only for the users to better notice things didn't go well. 
Signed-off-by: Lukáš Doktor --- tests/e2e/run-local.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/e2e/run-local.sh b/tests/e2e/run-local.sh index ae3dd826..17cba136 100755 --- a/tests/e2e/run-local.sh +++ b/tests/e2e/run-local.sh @@ -76,9 +76,12 @@ undo_changes() { } on_exit() { + RET="$?" if [ "$undo" == "true" ]; then + [ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET, starting the cleanup..." undo_changes fi + [ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET" || echo "INFO: Testing passed" } trap on_exit EXIT From 2d78020b3a84cda3215aabaaecc8d5fe9f467537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= Date: Tue, 30 Jan 2024 16:22:23 +0100 Subject: [PATCH 5/5] tests.e2e: Use github workflow commands for logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the simple "DEBUG|ERROR|INFO" prefixes with the github action commands "::debug::" as it should improve the GH logs readability while leaving the bash outputs still parsable by humans. Signed-off-by: Lukáš Doktor --- tests/e2e/operator.sh | 36 ++++++++++++++++++------------------ tests/e2e/run-local.sh | 20 ++++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/tests/e2e/operator.sh b/tests/e2e/operator.sh index 8cc9ab3c..601c808d 100755 --- a/tests/e2e/operator.sh +++ b/tests/e2e/operator.sh @@ -32,10 +32,10 @@ build_operator () { # so it's better to check it before adding the target repo. 
local sd="$(git config --global --get safe.directory ${project_dir} || true)" if [ "${sd}" == "" ]; then - echo "Add repo ${project_dir} to git's safe.directory" + echo "::debug:: Add repo ${project_dir} to git's safe.directory" git config --global --add safe.directory "${project_dir}" else - echo "Repo ${project_dir} already in git's safe.directory" + echo "::debug:: Repo ${project_dir} already in git's safe.directory" fi pushd "$project_dir" >/dev/null @@ -67,9 +67,9 @@ build_pre_install_img() { handle_older_containerd() { command -v containerd >/dev/null || return local version=$(containerd -v | awk '{ print $3 }' | sed 's/^v//') - echo "system's containerd version: $version" + echo "::debug:: system's containerd version: $version" if [[ "$version" =~ ^1.6 || "$version" =~ ^1.5 ]]; then - echo "Old system's containerd ($version). Configuring the operator to install a newer one" + echo "::warning:: Old system's containerd ($version). Configuring the operator to install a newer one" pushd "$project_dir" >/dev/null for kfile in $(find config/ -name "kustomization.yaml" \ -exec grep -l INSTALL_OFFICIAL_CONTAINERD {} \;);do @@ -104,10 +104,10 @@ install_operator() { local cmd="kubectl get pods -n "$op_ns" --no-headers |" cmd+="egrep -q ${controller_pod}.*'\'" if ! wait_for_process 120 10 "$cmd"; then - echo "ERROR: ${controller_pod} pod is not running" + echo "::error:: ${controller_pod} pod is not running" local pod_id="$(get_pods_regex $controller_pod $op_ns)" - echo "DEBUG: Pod $pod_id" + echo "::debug:: Pod $pod_id" debug_pod "$pod_id" "$op_ns" return 1 @@ -135,10 +135,10 @@ install_ccruntime() { cmd="kubectl get pods -n "$op_ns" --no-headers |" cmd+="egrep -q ${pod}.*'\'" if ! 
wait_for_process 600 30 "$cmd"; then - echo "ERROR: $pod pod is not running" + echo "::error:: $pod pod is not running" local pod_id="$(get_pods_regex $pod $op_ns)" - echo "DEBUG: Pod $pod_id" + echo "::debug:: Pod $pod_id" debug_pod "$pod_id" "$op_ns" return 1 @@ -149,7 +149,7 @@ install_ccruntime() { # There could be a case where it is not even if the pods above are running. cmd="kubectl get runtimeclass | grep -q ${runtimeclass}" if ! wait_for_process 300 30 "$cmd"; then - echo "ERROR: runtimeclass ${runtimeclass} is not up" + echo "::error:: runtimeclass ${runtimeclass} is not up" return 1 fi # To keep operator running, we should resume registry stopped during containerd restart. @@ -169,7 +169,7 @@ uninstall_ccruntime() { cmd+=" && ! echo \$_OUT | grep -q -e cc-operator-daemon-install" cmd+=" -e cc-operator-pre-install-daemon" if ! wait_for_process 720 30 "$cmd"; then - echo "ERROR: there are ccruntime pods still running" + echo "::error:: there are ccruntime pods still running" echo "::group::Describe pods from $op_ns namespace" kubectl -n "$op_ns" describe pods || true echo "::endgroup::" @@ -183,7 +183,7 @@ uninstall_ccruntime() { # Labels should be gone if kubectl get nodes "$SAFE_HOST_NAME" -o jsonpath='{.metadata.labels}' | \ grep -q -e cc-preinstall -e katacontainers.io; then - echo "ERROR: there are labels left behind" + echo "::error:: there are labels left behind" kubectl get nodes "$SAFE_HOST_NAME" -o jsonpath='{.metadata.labels}' return 1 @@ -207,7 +207,7 @@ kustomization_set_image() { # and this can introduce false-positive on the tests. So let's check the old image really # exist. if ! grep -q "name: ${old}$" ./kustomization.yaml; then - echo "ERROR: expected image ${old} in ${overlay_dir}/kustomization.yaml" + echo "::error:: expected image ${old} in ${overlay_dir}/kustomization.yaml" return 1 fi @@ -246,10 +246,10 @@ uninstall_operator() { local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')" cmd+="&& ! 
echo \$_OUT | grep -q -e cc-operator-controller-manager" if ! wait_for_process 180 30 "$cmd"; then - echo "ERROR: the controller manager is still running" + echo "::error:: the controller manager is still running" local pod_id="$(get_pods_regex $pod $op_ns)" - echo "DEBUG: Pod $pod_id" + echo "::debug:: Pod $pod_id" debug_pod "$pod_id" "$op_ns" return 1 @@ -268,7 +268,7 @@ wait_for_stabilization() { while read -r pod container restart_count; do if [ "${restart_counts[$pod-$container]--1}" != "$restart_count" ]; then - echo "DEBUG: Pod: $pod, Container: $container, Restart count: $restart_count" + echo "::debug:: Pod: $pod, Container: $container, Restart count: $restart_count" restart_counts["$pod-$container"]=$restart_count change=1 fi @@ -277,10 +277,10 @@ wait_for_stabilization() { [ $change -eq 0 ] && ((iteration+=1)) if [ $iteration -gt 3 ]; then - echo "INFO: No new restarts in 3x21s, proceeding..." + echo "::info:: No new restarts in 3x21s, proceeding..." break elif [ $count -gt 20 ]; then - echo "ERROR: Pods are still restarting after 20x21s, bailing out!" + echo "::error:: Pods are still restarting after 20x21s, bailing out!" return 1 fi @@ -334,7 +334,7 @@ main() { wait_for_stabilization ;; *) - echo "Unknown command '$1'" + echo "::error:: Unknown command '$1'" usage && exit 1 esac fi diff --git a/tests/e2e/run-local.sh b/tests/e2e/run-local.sh index 17cba136..38bde695 100755 --- a/tests/e2e/run-local.sh +++ b/tests/e2e/run-local.sh @@ -59,17 +59,17 @@ undo_changes() { pushd "$script_dir" >/dev/null # Do not try to undo steps that did not execute. 
if [ $step_install_operator -eq 1 ]; then - echo "INFO: Uninstall the operator" + echo "::info:: Uninstall the operator" run 10m sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true fi if [ $step_start_cluster -eq 1 ]; then - echo "INFO: Shutdown the cluster" + echo "::info:: Shutdown the cluster" run 5m sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true fi if [ $step_bootstrap_env -eq 1 ]; then - echo "INFO: Undo the bootstrap" + echo "::info:: Undo the bootstrap" run 5m ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true fi popd >/dev/null @@ -78,10 +78,10 @@ undo_changes() { on_exit() { RET="$?" if [ "$undo" == "true" ]; then - [ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET, starting the cleanup..." + [ "$RET" -ne 0 ] && echo && echo "::error:: Testing failed with $RET, starting the cleanup..." undo_changes fi - [ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET" || echo "INFO: Testing passed" + [ "$RET" -ne 0 ] && echo && echo "::error:: Testing failed with $RET" || echo "::info:: Testing passed" } trap on_exit EXIT @@ -93,27 +93,27 @@ main() { # Check Ansible is installed. if ! command -v ansible-playbook >/dev/null; then - echo "ERROR: ansible-playbook is required to run this script." + echo "::error:: ansible-playbook is required to run this script." 
exit 1 fi export "PATH=$PATH:/usr/local/bin" pushd "$script_dir" >/dev/null - echo "INFO: Bootstrap the local machine" + echo "::info:: Bootstrap the local machine" step_bootstrap_env=1 run 10m ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml - echo "INFO: Bring up the test cluster" + echo "::info:: Bring up the test cluster" step_start_cluster=1 run 10m sudo -E PATH="$PATH" bash -c './cluster/up.sh' export KUBECONFIG=/etc/kubernetes/admin.conf - echo "INFO: Build and install the operator" + echo "::info:: Build and install the operator" step_install_operator=1 run 20m sudo -E PATH="$PATH" bash -c './operator.sh' - echo "INFO: Run tests" + echo "::info:: Run tests" cmd="run 20m sudo -E PATH=\"$PATH\" bash -c " if [ -z "$runtimeclass" ]; then cmd+="'./tests_runner.sh'"