fix: fluvio-test harness bug (#2790)

Addresses #2695 A race condition existed in the logic to evaluate whether or not tests run with `fluvio-test` passed, making it possible for a non-passing-test to appear to succeed. Also, tests that timed-out were erroneously set as passing. Note that this effect was obscured as default timeout of one-hour was higher than github action timeout. 1. Added test harness validation tests that "failed to fail" using previous logic. 2. Added `--expect-fail` and `--expect-timeout` flags for tests that are supposed to fail or timeout and added `--expect-timeout` to longevity tests. 3. Tightened up logic so as to eliminate race condition. **Note: This fix exposes genuine failing tests, so it will not consistently pass CI pipeline.** ### validate-test-harness example output (ignoring stderr) ``` ./build-scripts/install_target.sh cargo build --bin fluvio-test -p fluvio-test cargo build --bin fluvio -p fluvio-cli cargo build --bin fluvio-run -p fluvio-run echo "clean up previous installation" clean up previous installation ./target/debug/fluvio cluster delete ./target/debug/fluvio-test expected_pass --local --cluster-start Start running fluvio test runner Starting cluster and testing connection remove cluster skipped installing cluster starting test in child process Creating the topic: unused topic "unused" created Starting example test that passes test complete, signaling to parent Test passed +----------+--------------+ | Pass? | true | |----------+--------------| | Duration | 827.659539ms | +----------+--------------+ ./target/debug/fluvio-test expected_fail --expect-fail Start running fluvio test runner Testing connection to Fluvio cluster in profile starting test in child process Creating the topic: unused topic "unused" already exists Starting example test that fails test failed as expected, signaling parent test complete, signaling to parent Test passed +----------+--------------+ | Pass? | true | |----------+--------------| | Duration | 2.731820019s | +----------+--------------+ ./target/debug/fluvio-test expected_fail_join_fail_first --expect-fail Start running fluvio test runner Testing connection to Fluvio cluster in profile starting test in child process Creating the topic: unused topic "unused" already exists Starting example test that fails test failed as expected, signaling parent test complete, signaling to parent Test passed +----------+--------------+ | Pass? | true | |----------+--------------| | Duration | 2.511258335s | +----------+--------------+ ./target/debug/fluvio-test expected_fail_join_success_first --expect-fail Start running fluvio test runner Testing connection to Fluvio cluster in profile starting test in child process Creating the topic: unused topic "unused" already exists Starting example test that fails test failed as expected, signaling parent test complete, signaling to parent Test passed +----------+--------------+ | Pass? | true | |----------+--------------| | Duration | 925.028775ms | +----------+--------------+ ./target/debug/fluvio-test expected_timeout --timeout 5sec --expect-timeout Start running fluvio test runner Testing connection to Fluvio cluster in profile starting test in child process Creating the topic: unused topic "unused" already exists Starting example test that timeouts Test timed out as expected after 5 seconds killing child test pid 1250657 name fluvio-test killing child test pid 1250686 name fluvio-test +----------+--------------+ | Pass? | true | |----------+--------------| | Duration | 5.405322312s | +----------+--------------+ ```
infinyon · Nov 8, 2022 · 3a739ee · 3a739ee
1 parent 3adb7fb
commit 3a739ee
Show file tree

Hide file tree

Showing 11 changed files with 301 additions and 28 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -522,14 +522,15 @@ jobs:
       TEST_BIN: ~/bin/fluvio-test
       SERVER_LOG: fluvio=debug
     strategy:
- #     fail-fast: false
+      fail-fast: false
       matrix:
         os: [ubuntu-latest]
         rust-target: [x86_64-unknown-linux-musl]
         run: [r1]
         spu: [2]
         test:
           [
+            validate-test-harness,
             smoke-test,
             smoke-test-tls,
             smoke-test-at-most-once,
@@ -582,6 +583,11 @@ jobs:
           fluvio cluster start --local --develop --spu ${{ matrix.spu }} --rust-log ${{ env.SERVER_LOG }} ${{ env.TLS_ARGS }}
       - name: sleep
         run: sleep 15
+      - name: Validate test harness
+        if: matrix.test == 'validate-test-harness'
+        timeout-minutes: 5
+        run : |
+          make validate-test-harness
       - name: Run smoke-test
         if: matrix.test == 'smoke-test'
         timeout-minutes: 5
@@ -689,7 +695,7 @@ jobs:
       UNINSTALL: noclean
       SERVER_LOG: fluvio=debug
     strategy:
-  #    fail-fast: false
+      fail-fast: false
       matrix:
         os: [ubuntu-latest]
         rust-target: [x86_64-unknown-linux-musl]

diff --git a/.github/workflows/hourly.yml b/.github/workflows/hourly.yml
@@ -21,7 +21,7 @@ env:
 
 jobs:
   build_long_binaries:
-    name: Test buiild for ${{ matrix.binary }}  on (${{ matrix.os }})
+    name: Test build for ${{ matrix.binary }}  on (${{ matrix.os }})
     if: ${{ false }}
     runs-on: ${{ matrix.os }}
     strategy:
@@ -191,6 +191,7 @@ jobs:
             --topic ${{ matrix.topic }} \
             --topic-segment-size ${{ matrix.topic_segment_bytes }} \
             --topic-retention ${{ matrix.topic_retention }} \
+            --expect-timeout
             longevity
 
       ## If the test passed, then copy the data from cluster to store into artifacts

diff --git a/crates/fluvio-test-util/test_meta/environment.rs b/crates/fluvio-test-util/test_meta/environment.rs
@@ -264,7 +264,7 @@ pub struct EnvironmentSetup {
     #[clap(long)]
     pub disable_timeout: bool,
 
-    /// Global timeout for a test. Will report as fail when reached
+    /// Global timeout for a test. Will report as fail when reached (unless --expect-timeout)
     /// ex. 30s, 15m, 2h, 1w
     #[clap(long, default_value = "1h", value_parser=parse_duration)]
     pub timeout: Duration,
@@ -276,4 +276,12 @@ pub struct EnvironmentSetup {
     /// K8: use sc address
     #[clap(long)]
     pub proxy_addr: Option<String>,
+
+    /// Will report fail unless test times out
+    #[clap(long, conflicts_with = "expect_fail")]
+    pub expect_timeout: bool,
+
+    /// Expect a test to fail. (fail-> pass. pass or timeout -> fail)
+    #[clap(long, conflicts_with = "expect_timeout")]
+    pub expect_fail: bool,
 }
diff --git a/crates/fluvio-test/src/main.rs b/crates/fluvio-test/src/main.rs
@@ -84,9 +84,9 @@ fn run_test(
     let test_cluster_opts = TestCluster::new(environment.clone());
     let test_driver = TestDriver::new(Some(test_cluster_opts));
 
-    let ok_signal = Arc::new(AtomicBool::new(false));
+    let finished_signal = Arc::new(AtomicBool::new(false));
     let fail_signal = Arc::new(AtomicBool::new(false));
-    signal_hook::flag::register(signal_hook::consts::SIGUSR1, Arc::clone(&ok_signal))
+    signal_hook::flag::register(signal_hook::consts::SIGUSR1, Arc::clone(&finished_signal))
         .expect("fail to register ok signal hook");
     signal_hook::flag::register(signal_hook::consts::SIGUSR2, Arc::clone(&fail_signal))
         .expect("fail to register fail signal hook");
@@ -107,17 +107,26 @@ fn run_test(
             let status = std::panic::catch_unwind(AssertUnwindSafe(|| {
                 (test_meta.test_fn)(test_driver, test_case)
             }));
-            match status {
-                Ok(_) => {
-                    println!("test passed, signalling success to parent");
-                    root_process.kill_with(Signal::User1);
-                    process::exit(0)
-                }
-                Err(err) => {
-                    println!("test failed {:#?}, signalling parent", err);
-                    root_process.kill_with(Signal::User2);
-                    process::exit(1);
+            let parent_id = get_parent_pid();
+            debug!(
+                "catch_unwind. PID {:?}, root_pid {root_pid}, parent_id {parent_id}",
+                get_current_pid().unwrap(),
+            );
+            if parent_id == root_pid {
+                println!("test complete, signaling to parent");
+                root_process.kill_with(Signal::User1);
+            }
+            if let Err(err) = status {
+                if environment.expect_fail {
+                    println!("test failed as expected, signaling parent");
+                } else {
+                    println!("test failed {:#?}, signaling parent", err);
                 }
+                // This doesn't actually kill root_process, just sends it the signal
+                root_process.kill_with(Signal::User2);
+                process::exit(1);
+            } else {
+                process::exit(0);
             }
         }
         Err(_) => panic!("Fork failed"),
@@ -132,7 +141,7 @@ fn run_test(
     let mut timed_out = false;
 
     loop {
-        if ok_signal.load(Ordering::Relaxed) || fail_signal.load(Ordering::Relaxed) {
+        if finished_signal.load(Ordering::Relaxed) {
             debug!("signal received");
             break;
         }
@@ -148,29 +157,36 @@ fn run_test(
         }
     }
 
-    let ok = ok_signal.load(Ordering::Relaxed);
-    let fail = fail_signal.load(Ordering::Relaxed);
+    let success = fail_signal.load(Ordering::Relaxed) == environment.expect_fail;
 
-    debug!(ok, fail, "signal status");
+    debug!(success, "signal status");
 
     if timed_out {
-        println!("Test timed out after {} seconds", timeout.as_secs());
+        let success = if environment.expect_timeout {
+            println!(
+                "Test timed out as expected after {} seconds",
+                timeout.as_secs()
+            );
+            true
+        } else {
+            println!("Test timed out after {} seconds", timeout.as_secs());
+            false
+        };
         kill_child_processes(root_process);
         TestResult {
-            success: true,
+            success,
             duration: start.elapsed().unwrap(),
             ..std::default::Default::default()
         }
-    } else if ok {
+    } else if success {
         println!("Test passed");
         TestResult {
             success: true,
             duration: start.elapsed().unwrap(),
             ..std::default::Default::default()
         }
     } else {
-        println!("test failed, killing all child processes");
-        kill_child_processes(root_process);
+        println!("Test failed");
         TestResult {
             success: false,
             duration: start.elapsed().unwrap(),
@@ -277,6 +293,14 @@ fn create_spinning_indicator() -> Option<ProgressBar> {
     }
 }
 
+fn get_parent_pid() -> sysinfo::Pid {
+    let pid = get_current_pid().expect("Unable to get current pid");
+    let mut sys2 = System::new();
+    sys2.refresh_processes();
+    let current_process = sys2.process(pid).expect("Current process not found");
+    current_process.parent().expect("Parent process not found")
+}
+
 #[cfg(test)]
 mod tests {
     // CLI Tests

diff --git a/crates/fluvio-test/src/tests/expected_fail.rs b/crates/fluvio-test/src/tests/expected_fail.rs
@@ -0,0 +1,40 @@
+use std::any::Any;
+use std::time::Duration;
+
+use clap::Parser;
+use fluvio_future::timer::sleep;
+use fluvio_test_derive::fluvio_test;
+use fluvio_test_util::test_meta::{TestOption, TestCase};
+use fluvio_test_util::async_process;
+
+#[derive(Debug, Clone)]
+pub struct ExpectedFailTestCase {}
+
+impl From<TestCase> for ExpectedFailTestCase {
+    fn from(_test_case: TestCase) -> Self {
+        ExpectedFailTestCase {}
+    }
+}
+
+#[derive(Debug, Parser, Clone)]
+#[clap(name = "Fluvio Expected Fail Test")]
+pub struct ExpectedFailTestOption {}
+impl TestOption for ExpectedFailTestOption {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+#[fluvio_test(name = "expected_fail", topic = "unused")]
+pub fn run(mut test_driver: FluvioTestDriver, mut test_case: TestCase) {
+    println!("\nStarting example test that fails");
+
+    let fast_fail = async_process!(
+        async {
+            sleep(Duration::from_millis(2000)).await;
+            panic!("This test should fail");
+        },
+        "fast-fail"
+    );
+    fast_fail.join().unwrap();
+}
diff --git a/crates/fluvio-test/src/tests/expected_fail_join_fail_first.rs b/crates/fluvio-test/src/tests/expected_fail_join_fail_first.rs
@@ -0,0 +1,48 @@
+use std::any::Any;
+use std::time::Duration;
+
+use clap::Parser;
+use fluvio_future::timer::sleep;
+use fluvio_test_derive::fluvio_test;
+use fluvio_test_util::test_meta::{TestOption, TestCase};
+use fluvio_test_util::async_process;
+
+#[derive(Debug, Clone)]
+pub struct ExpectedFailJoinFailFirstTestCase {}
+
+impl From<TestCase> for ExpectedFailJoinFailFirstTestCase {
+    fn from(_test_case: TestCase) -> Self {
+        ExpectedFailJoinFailFirstTestCase {}
+    }
+}
+
+#[derive(Debug, Parser, Clone)]
+#[clap(name = "Fluvio Expected FailJoinFailFirst Test")]
+pub struct ExpectedFailJoinFailFirstTestOption {}
+impl TestOption for ExpectedFailJoinFailFirstTestOption {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+#[fluvio_test(name = r#"expected_fail_join_fail_first"#, topic = "unused")]
+pub fn run(mut test_driver: FluvioTestDriver, mut test_case: TestCase) {
+    println!("\nStarting example test that fails");
+
+    let success = async_process!(
+        async {
+            sleep(Duration::from_millis(2000)).await;
+        },
+        "success"
+    );
+
+    let fail = async_process!(
+        async {
+            sleep(Duration::from_millis(200)).await;
+            panic!("This test should fail");
+        },
+        "fail"
+    );
+    fail.join().unwrap();
+    success.join().unwrap();
+}
diff --git a/crates/fluvio-test/src/tests/expected_fail_join_success_first.rs b/crates/fluvio-test/src/tests/expected_fail_join_success_first.rs
@@ -0,0 +1,48 @@
+use std::any::Any;
+use std::time::Duration;
+
+use clap::Parser;
+use fluvio_future::timer::sleep;
+use fluvio_test_derive::fluvio_test;
+use fluvio_test_util::test_meta::{TestOption, TestCase};
+use fluvio_test_util::async_process;
+
+#[derive(Debug, Clone)]
+pub struct ExpectedFailJoinSuccessFirstTestCase {}
+
+impl From<TestCase> for ExpectedFailJoinSuccessFirstTestCase {
+    fn from(_test_case: TestCase) -> Self {
+        ExpectedFailJoinSuccessFirstTestCase {}
+    }
+}
+
+#[derive(Debug, Parser, Clone)]
+#[clap(name = "Fluvio Expected FailJoinSuccessFirst Test")]
+pub struct ExpectedFailJoinSuccessFirstTestOption {}
+impl TestOption for ExpectedFailJoinSuccessFirstTestOption {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+#[fluvio_test(name = "expected_fail_join_success_first", topic = "unused")]
+pub fn run(mut test_driver: FluvioTestDriver, mut test_case: TestCase) {
+    println!("\nStarting example test that fails");
+
+    let success = async_process!(
+        async {
+            sleep(Duration::from_millis(100)).await;
+        },
+        "success"
+    );
+
+    let fail = async_process!(
+        async {
+            sleep(Duration::from_millis(200)).await;
+            panic!("This test should fail");
+        },
+        "fail"
+    );
+    success.join().unwrap();
+    fail.join().unwrap();
+}
diff --git a/crates/fluvio-test/src/tests/expected_pass.rs b/crates/fluvio-test/src/tests/expected_pass.rs
@@ -0,0 +1,37 @@
+use std::any::Any;
+
+use clap::Parser;
+use fluvio_test_derive::fluvio_test;
+use fluvio_test_util::test_meta::{TestOption, TestCase};
+use fluvio_test_util::async_process;
+
+#[derive(Debug, Clone)]
+pub struct ExpectedPassTestCase {}
+
+impl From<TestCase> for ExpectedPassTestCase {
+    fn from(_test_case: TestCase) -> Self {
+        ExpectedPassTestCase {}
+    }
+}
+
+#[derive(Debug, Parser, Clone)]
+#[clap(name = "Fluvio Expected Fail Test")]
+pub struct ExpectedPassTestOption {}
+impl TestOption for ExpectedPassTestOption {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+}
+
+#[fluvio_test(name = "expected_pass", topic = "unused")]
+pub fn run(mut test_driver: FluvioTestDriver, mut test_case: TestCase) {
+    println!("\nStarting example test that passes");
+
+    let fast_success = async_process!(
+        async {
+            // Do nothing and exit
+        },
+        "fast-success"
+    );
+    fast_success.join().unwrap();
+}