From b72f8f1c326dfa640102b5039fe32d7d3c836c6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= <ldoktor@redhat.com>
Date: Tue, 30 Jan 2024 14:22:10 +0100
Subject: [PATCH 1/5] tests.e2e: Add timeouts to individual steps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are seeing stalled jobs; let's ensure our steps won't take longer
than expected.

Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
---
 .github/workflows/ccruntime_e2e.yaml |  2 +-
 tests/e2e/run-local.sh               | 28 ++++++++++++++++++++--------
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ccruntime_e2e.yaml b/.github/workflows/ccruntime_e2e.yaml
index 788eb2f0..674480ee 100644
--- a/.github/workflows/ccruntime_e2e.yaml
+++ b/.github/workflows/ccruntime_e2e.yaml
@@ -62,7 +62,7 @@ jobs:
           if [ $RUNNING_INSTANCE = "s390x" ]; then
             args=""
           fi
-          ./run-local.sh -r "${{ matrix.runtimeclass }}" "${args}"
+          ./run-local.sh -t -r "${{ matrix.runtimeclass }}" "${args}"
         env:
           RUNNING_INSTANCE: ${{ matrix.instance }}
 
diff --git a/tests/e2e/run-local.sh b/tests/e2e/run-local.sh
index bfbb780e..ae3dd826 100755
--- a/tests/e2e/run-local.sh
+++ b/tests/e2e/run-local.sh
@@ -15,6 +15,7 @@ step_start_cluster=0
 step_install_operator=0
 runtimeclass=""
 undo="false"
+timeout="false"
 
 usage() {
 	cat <<-EOF
@@ -29,36 +30,47 @@ usage() {
                          the tests. Defaults to "kata-qemu".
 	-u: undo the installation and configuration before exiting. Useful for
 	    baremetal machine were it needs to do clean up for the next tests.
+	-t: enable default timeout for each operation (useful for CI)
 	EOF
 }
 
 parse_args() {
-	while getopts "hr:u" opt; do
+	while getopts "hr:ut" opt; do
 		case $opt in
 			h) usage && exit 0;;
 			r) runtimeclass="$OPTARG";;
 			u) undo="true";;
+			t) timeout="true";;
 			*) usage && exit 1;;
 		esac
 	done
 }
 
+run() {
+	duration=$1; shift
+	if [ "$timeout" == "true" ]; then
+		timeout $duration "$@"
+	else
+		"$@"
+	fi
+}
+
 undo_changes() {
 	pushd "$script_dir" >/dev/null
 	# Do not try to undo steps that did not execute.
 	if [ $step_install_operator -eq 1 ]; then
 		echo "INFO: Uninstall the operator"
-		sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true
+		run 10m sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true
 	fi
 
 	if [ $step_start_cluster -eq 1 ]; then
 		echo "INFO: Shutdown the cluster"
-		sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true
+		run 5m sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true
 	fi
 
 	if [ $step_bootstrap_env -eq 1 ]; then
 		echo "INFO: Undo the bootstrap"
-		ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true
+		run 5m ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true
 	fi
 	popd >/dev/null
 }
@@ -87,19 +99,19 @@ main() {
 	pushd "$script_dir" >/dev/null
 	echo "INFO: Bootstrap the local machine"
 	step_bootstrap_env=1
-	ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml
+	run 10m ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml
 
 	echo "INFO: Bring up the test cluster"
 	step_start_cluster=1
-	sudo -E PATH="$PATH" bash -c './cluster/up.sh'
+	run 10m sudo -E PATH="$PATH" bash -c './cluster/up.sh'
 	export KUBECONFIG=/etc/kubernetes/admin.conf
 
 	echo "INFO: Build and install the operator"
 	step_install_operator=1
-	sudo -E PATH="$PATH" bash -c './operator.sh'
+	run 20m sudo -E PATH="$PATH" bash -c './operator.sh'
 
 	echo "INFO: Run tests"
-	cmd="sudo -E PATH=\"$PATH\" bash -c "
+	cmd="run 20m sudo -E PATH=\"$PATH\" bash -c "
 	if [ -z "$runtimeclass" ]; then
 		cmd+="'./tests_runner.sh'"
 	else

From db80d13cacf1a74b5ff8df38da11a7f402ed9fb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= <ldoktor@redhat.com>
Date: Tue, 30 Jan 2024 14:23:32 +0100
Subject: [PATCH 2/5] tests.e2e: Add 10m timeout for each bats test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ensure each test won't take longer than the expected threshold by
setting a 10m timeout. Note that only one timeout can be set for all
tests within a single file.

Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/e2e/operator_tests.bats | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/e2e/operator_tests.bats b/tests/e2e/operator_tests.bats
index bea771c0..0cc7449c 100644
--- a/tests/e2e/operator_tests.bats
+++ b/tests/e2e/operator_tests.bats
@@ -8,6 +8,9 @@
 load "${BATS_TEST_DIRNAME}/lib.sh"
 test_tag="[cc][operator]"
 
+# Set 10m timeout for each test
+export BATS_TEST_TIMEOUT=600
+
 is_operator_installed() {
 	[ "$(kubectl get deployment -n "$ns" --no-headers 2>/dev/null | wc -l)" \
 		-gt 0 ]

From 62d67be442141d7d0bf25b4bc71cced9011546cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= <ldoktor@redhat.com>
Date: Tue, 30 Jan 2024 16:11:36 +0100
Subject: [PATCH 3/5] tests.e2e: Ensure we don't skip on kubectl failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Recent issues in CI indicate that kubectl might sometimes fail, which
results in wait_for_process interrupting the loop. Let's improve the
command to ensure the kubectl command passed and only then grep for the
(un)expected output.

Note the positive commands do not need to be treated this way, as the
output should not contain the pod names on failure.

Fixes: #339

Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/e2e/operator.sh | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/tests/e2e/operator.sh b/tests/e2e/operator.sh
index 3503ecd7..8cc9ab3c 100755
--- a/tests/e2e/operator.sh
+++ b/tests/e2e/operator.sh
@@ -164,9 +164,9 @@ uninstall_ccruntime() {
 	popd >/dev/null
 
 	# Wait and ensure ccruntime pods are gone
-	#
-	local cmd="! sudo -E kubectl get pods -n $op_ns |"
-	cmd+="grep -q -e cc-operator-daemon-install"
+	# (ensure failing kubectl keeps iterating)
+	local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')"
+	cmd+=" && ! echo \$_OUT | grep -q -e cc-operator-daemon-install"
 	cmd+=" -e cc-operator-pre-install-daemon"
 	if ! wait_for_process 720 30 "$cmd"; then
 		echo "ERROR: there are ccruntime pods still running"
@@ -242,10 +242,9 @@ uninstall_operator() {
 	popd >/dev/null
 
 	# Wait and ensure the controller pod is gone
-	#
-	local pod="cc-operator-controller-manager"
-	local cmd="! kubectl get pods -n $op_ns |"
-	cmd+="grep -q $pod"
+	# (ensure failing kubectl keeps iterating)
+	local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')"
+	cmd+="&& ! echo \$_OUT | grep -q -e cc-operator-controller-manager"
 	if ! wait_for_process 180 30 "$cmd"; then
 		echo "ERROR: the controller manager is still running"
 

From 5d4895d7a173afea73f658bbc7b0dafeda146236 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= <ldoktor@redhat.com>
Date: Tue, 30 Jan 2024 16:14:23 +0100
Subject: [PATCH 4/5] tests.e2e: Log an error message after unsuccessful run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On test failure we might still execute a cleanup that spills a bunch of
text, making it not obvious whether the testing passed or failed. Note
the return code is already fine; this change is only for the users to
better notice that things didn't go well.

Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/e2e/run-local.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/e2e/run-local.sh b/tests/e2e/run-local.sh
index ae3dd826..17cba136 100755
--- a/tests/e2e/run-local.sh
+++ b/tests/e2e/run-local.sh
@@ -76,9 +76,12 @@ undo_changes() {
 }
 
 on_exit() {
+	RET="$?"
 	if [ "$undo" == "true" ]; then
+		[ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET, starting the cleanup..."
 		undo_changes
 	fi
+	[ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET" || echo "INFO: Testing passed"
 }
 
 trap on_exit EXIT

From 2d78020b3a84cda3215aabaaecc8d5fe9f467537 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= <ldoktor@redhat.com>
Date: Tue, 30 Jan 2024 16:22:23 +0100
Subject: [PATCH 5/5] tests.e2e: Use github workflow commands for logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the simple "DEBUG|ERROR|INFO" prefixes with GitHub Actions
workflow commands such as "::debug::", as it should improve the GH logs
readability while leaving the bash outputs still parsable by humans.

Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
---
 tests/e2e/operator.sh  | 36 ++++++++++++++++++------------------
 tests/e2e/run-local.sh | 20 ++++++++++----------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/tests/e2e/operator.sh b/tests/e2e/operator.sh
index 8cc9ab3c..601c808d 100755
--- a/tests/e2e/operator.sh
+++ b/tests/e2e/operator.sh
@@ -32,10 +32,10 @@ build_operator () {
 	# so it's better to check it before adding the target repo.
 	local sd="$(git config --global --get safe.directory ${project_dir} || true)"
 	if [ "${sd}" == "" ]; then
-		echo "Add repo ${project_dir} to git's safe.directory"
+		echo "::debug:: Add repo ${project_dir} to git's safe.directory"
 		git config --global --add safe.directory "${project_dir}"
 	else
-		echo "Repo ${project_dir} already in git's safe.directory"
+		echo "::debug:: Repo ${project_dir} already in git's safe.directory"
 	fi
 
 	pushd "$project_dir" >/dev/null
@@ -67,9 +67,9 @@ build_pre_install_img() {
 handle_older_containerd() {
 	command -v containerd >/dev/null || return
 	local version=$(containerd -v | awk '{ print $3 }' | sed 's/^v//')
-	echo "system's containerd version: $version"
+	echo "::debug:: system's containerd version: $version"
 	if [[ "$version" =~ ^1.6 || "$version" =~ ^1.5 ]]; then
-		echo "Old system's containerd ($version). Configuring the operator to install a newer one"
+		echo "::warning:: Old system's containerd ($version). Configuring the operator to install a newer one"
 		pushd "$project_dir" >/dev/null
 		for kfile in $(find config/ -name "kustomization.yaml" \
 			-exec grep -l INSTALL_OFFICIAL_CONTAINERD {} \;);do
@@ -104,10 +104,10 @@ install_operator() {
 	local cmd="kubectl get pods -n "$op_ns" --no-headers |"
 	cmd+="egrep -q ${controller_pod}.*'\<Running\>'"
 	if ! wait_for_process 120 10 "$cmd"; then
-		echo "ERROR: ${controller_pod} pod is not running"
+		echo "::error:: ${controller_pod} pod is not running"
 
 		local pod_id="$(get_pods_regex $controller_pod $op_ns)"
-		echo "DEBUG: Pod $pod_id"
+		echo "::debug:: Pod $pod_id"
 		debug_pod "$pod_id" "$op_ns"
 
 		return 1
@@ -135,10 +135,10 @@ install_ccruntime() {
 		cmd="kubectl get pods -n "$op_ns" --no-headers |"
 		cmd+="egrep -q ${pod}.*'\<Running\>'"
 		if ! wait_for_process 600 30 "$cmd"; then
-			echo "ERROR: $pod pod is not running"
+			echo "::error:: $pod pod is not running"
 
 			local pod_id="$(get_pods_regex $pod $op_ns)"
-			echo "DEBUG: Pod $pod_id"
+			echo "::debug:: Pod $pod_id"
 			debug_pod "$pod_id" "$op_ns"
 
 			return 1
@@ -149,7 +149,7 @@ install_ccruntime() {
 	# There could be a case where it is not even if the pods above are running.
 	cmd="kubectl get runtimeclass | grep -q ${runtimeclass}"
 	if ! wait_for_process 300 30 "$cmd"; then
-		echo "ERROR: runtimeclass ${runtimeclass} is not up"
+		echo "::error:: runtimeclass ${runtimeclass} is not up"
 		return 1
 	fi
 	# To keep operator running, we should resume registry stopped during containerd restart.
@@ -169,7 +169,7 @@ uninstall_ccruntime() {
 	cmd+=" && ! echo \$_OUT | grep -q -e cc-operator-daemon-install"
 	cmd+=" -e cc-operator-pre-install-daemon"
 	if ! wait_for_process 720 30 "$cmd"; then
-		echo "ERROR: there are ccruntime pods still running"
+		echo "::error:: there are ccruntime pods still running"
 		echo "::group::Describe pods from $op_ns namespace"
 		kubectl -n "$op_ns" describe pods || true
 		echo "::endgroup::"
@@ -183,7 +183,7 @@ uninstall_ccruntime() {
 	# Labels should be gone
 	if kubectl get nodes "$SAFE_HOST_NAME" -o jsonpath='{.metadata.labels}' | \
 		grep -q -e cc-preinstall -e katacontainers.io; then
-		echo "ERROR: there are labels left behind"
+		echo "::error:: there are labels left behind"
 		kubectl get nodes "$SAFE_HOST_NAME" -o jsonpath='{.metadata.labels}'
 
 		return 1
@@ -207,7 +207,7 @@ kustomization_set_image() {
 	# and this can introduce false-positive on the tests. So let's check the old image really
 	# exist.
 	if ! grep -q "name: ${old}$" ./kustomization.yaml; then
-		echo "ERROR: expected image ${old} in ${overlay_dir}/kustomization.yaml"
+		echo "::error:: expected image ${old} in ${overlay_dir}/kustomization.yaml"
 		return 1
 	fi
 
@@ -246,10 +246,10 @@ uninstall_operator() {
 	local cmd="_OUT=\$(sudo -E kubectl get pods -n '$op_ns')"
 	cmd+="&& ! echo \$_OUT | grep -q -e cc-operator-controller-manager"
 	if ! wait_for_process 180 30 "$cmd"; then
-		echo "ERROR: the controller manager is still running"
+		echo "::error:: the controller manager is still running"
 
 		local pod_id="$(get_pods_regex $pod $op_ns)"
-		echo "DEBUG: Pod $pod_id"
+		echo "::debug:: Pod $pod_id"
 		debug_pod "$pod_id" "$op_ns"
 
 		return 1
@@ -268,7 +268,7 @@ wait_for_stabilization() {
 
 		while read -r pod container restart_count; do
 			if [ "${restart_counts[$pod-$container]--1}" != "$restart_count" ]; then
-				echo "DEBUG: Pod: $pod, Container: $container, Restart count: $restart_count"
+				echo "::debug:: Pod: $pod, Container: $container, Restart count: $restart_count"
 				restart_counts["$pod-$container"]=$restart_count
 				change=1
 			fi
@@ -277,10 +277,10 @@ wait_for_stabilization() {
 		[ $change -eq 0 ] && ((iteration+=1))
 
 		if [ $iteration -gt 3 ]; then
-			echo "INFO: No new restarts in 3x21s, proceeding..."
+			echo "::info:: No new restarts in 3x21s, proceeding..."
 			break
 		elif [ $count -gt 20 ]; then
-			echo "ERROR: Pods are still restarting after 20x21s, bailing out!"
+			echo "::error:: Pods are still restarting after 20x21s, bailing out!"
 			return 1
 		fi
 
@@ -334,7 +334,7 @@ main() {
 				wait_for_stabilization
 				;;
 			*)
-				echo "Unknown command '$1'"
+				echo "::error:: Unknown command '$1'"
 				usage && exit 1
 		esac
 	fi
diff --git a/tests/e2e/run-local.sh b/tests/e2e/run-local.sh
index 17cba136..38bde695 100755
--- a/tests/e2e/run-local.sh
+++ b/tests/e2e/run-local.sh
@@ -59,17 +59,17 @@ undo_changes() {
 	pushd "$script_dir" >/dev/null
 	# Do not try to undo steps that did not execute.
 	if [ $step_install_operator -eq 1 ]; then
-		echo "INFO: Uninstall the operator"
+		echo "::info:: Uninstall the operator"
 		run 10m sudo -E PATH="$PATH" bash -c './operator.sh uninstall' || true
 	fi
 
 	if [ $step_start_cluster -eq 1 ]; then
-		echo "INFO: Shutdown the cluster"
+		echo "::info:: Shutdown the cluster"
 		run 5m sudo -E PATH="$PATH" bash -c './cluster/down.sh' || true
 	fi
 
 	if [ $step_bootstrap_env -eq 1 ]; then
-		echo "INFO: Undo the bootstrap"
+		echo "::info:: Undo the bootstrap"
 		run 5m ansible-playbook -i localhost, -c local --tags undo ansible/main.yml || true
 	fi
 	popd >/dev/null
@@ -78,10 +78,10 @@ undo_changes() {
 on_exit() {
 	RET="$?"
 	if [ "$undo" == "true" ]; then
-		[ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET, starting the cleanup..."
+		[ "$RET" -ne 0 ] && echo && echo "::error:: Testing failed with $RET, starting the cleanup..."
 		undo_changes
 	fi
-	[ "$RET" -ne 0 ] && echo && echo "ERROR: Testing failed with $RET" || echo "INFO: Testing passed"
+	[ "$RET" -ne 0 ] && echo && echo "::error:: Testing failed with $RET" || echo "::info:: Testing passed"
 }
 
 trap on_exit EXIT
@@ -93,27 +93,27 @@ main() {
 
 	# Check Ansible is installed.
 	if ! command -v ansible-playbook >/dev/null; then
-		echo "ERROR: ansible-playbook is required to run this script."
+		echo "::error:: ansible-playbook is required to run this script."
 		exit 1
 	fi
 
 	export "PATH=$PATH:/usr/local/bin"
 
 	pushd "$script_dir" >/dev/null
-	echo "INFO: Bootstrap the local machine"
+	echo "::info:: Bootstrap the local machine"
 	step_bootstrap_env=1
 	run 10m ansible-playbook -i localhost, -c local --tags untagged ansible/main.yml
 
-	echo "INFO: Bring up the test cluster"
+	echo "::info:: Bring up the test cluster"
 	step_start_cluster=1
 	run 10m sudo -E PATH="$PATH" bash -c './cluster/up.sh'
 	export KUBECONFIG=/etc/kubernetes/admin.conf
 
-	echo "INFO: Build and install the operator"
+	echo "::info:: Build and install the operator"
 	step_install_operator=1
 	run 20m sudo -E PATH="$PATH" bash -c './operator.sh'
 
-	echo "INFO: Run tests"
+	echo "::info:: Run tests"
 	cmd="run 20m sudo -E PATH=\"$PATH\" bash -c "
 	if [ -z "$runtimeclass" ]; then
 		cmd+="'./tests_runner.sh'"