From a7e4b5a467e997e239ed18d0625686560ef8500d Mon Sep 17 00:00:00 2001
From: Chris Fallin <chris@cfallin.org>
Date: Sat, 5 Feb 2022 23:00:28 -0800
Subject: [PATCH] Patch qemu in CI to fix madvise semantics.

We currently skip some tests when running our qemu-based tests for
aarch64 and s390x. Qemu has broken madvise(MADV_DONTNEED) semantics --
specifically, it just ignores madvise() [1].

We could continue to whack-a-mole the tests whenever we create new
functionality that relies on madvise() semantics, but ideally we'd just
have emulation that properly emulates!

The earlier discussions on the qemu mailing list [2] had a proposed
patch for this, but (i) this patch doesn't seem to apply cleanly anymore
(it's 3.5 years old) and (ii) it's pretty complex due to the need to
handle qemu's ability to emulate differing page sizes on host and guest.

It turns out that we only really need this for CI when host and guest
have the same page size (4KiB), so we *could* just pass the madvise()s
through. I wouldn't expect such a patch to ever land upstream in qemu,
but it satisfies our needs I think. So this PR modifies our CI setup to
patch qemu before building it locally with a little one-off patch.

[1]
https://github.com/bytecodealliance/wasmtime/pull/2518#issuecomment-747280133

[2]
https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg05416.html
---
 .github/workflows/main.yml                    |  3 +-
 ci/qemu-madvise.patch                         | 61 +++++++++++++++++++
 .../runtime/src/instance/allocator/pooling.rs |  5 --
 crates/runtime/src/memfd.rs                   | 23 -------
 4 files changed, 63 insertions(+), 29 deletions(-)
 create mode 100644 ci/qemu-madvise.patch

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 8516d4ec0a47..7e7b71e77822 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -253,7 +253,7 @@ jobs:
     - uses: actions/cache@v2
       with:
         path: ${{ runner.tool_cache }}/qemu
-        key: qemu-${{ matrix.target }}-${{ env.QEMU_BUILD_VERSION }}
+        key: qemu-${{ matrix.target }}-${{ env.QEMU_BUILD_VERSION }}-patchmadvise2
       if: matrix.target != '' && matrix.os == 'ubuntu-latest'
     - name: Install cross-compilation tools
       run: |
@@ -286,6 +286,7 @@ jobs:
         # quickly.
         curl https://download.qemu.org/qemu-$QEMU_BUILD_VERSION.tar.xz | tar xJf -
         cd qemu-$QEMU_BUILD_VERSION
+        patch -p1 < $GITHUB_WORKSPACE/ci/qemu-madvise.patch
         ./configure --target-list=${{ matrix.qemu_target }} --prefix=${{ runner.tool_cache}}/qemu --disable-tools --disable-slirp --disable-fdt --disable-capstone --disable-docs
         ninja -C build install
         touch ${{ runner.tool_cache }}/qemu/built
diff --git a/ci/qemu-madvise.patch b/ci/qemu-madvise.patch
new file mode 100644
index 000000000000..d2eb9fcd33e1
--- /dev/null
+++ b/ci/qemu-madvise.patch
@@ -0,0 +1,61 @@
+From 1ec3de1634195a4d4410cc33fdc66c68057e16a3 Mon Sep 17 00:00:00 2001
+From: Chris Fallin <chris@cfallin.org>
+Date: Sat, 5 Feb 2022 22:45:58 -0800
+Subject: [PATCH] Emulate Linux madvise() properly when possible.
+
+Curently madvise() is not emulated for Linux targets because it is not
+trivial to emulate when the guest and host page sizes differ -- in this
+case, mmap()s are not passed straight through, so the semantics of
+various MADV_* flags are not trivial to replicate.
+
+However, if the guest and host are both Linux, and the page sizes are
+the same on both ends (which is often the case, e.g. 4KiB for x86-64,
+aarch64, s390x, and possibly others), then the mmap()s are in fact
+passed straight through. Furthermore, the MADV_* flags are defined in
+target-independent headers, so we can pass the base, length, and
+`advice` arugments to `madvise()` straight through.
+
+This patch alters the Linux-userspace syscall emulation to do just that,
+passing through the `madvise()` calls when possible and returning
+`EINVAL` otherwise so the guest is properly informed that the desired
+semantics (e.g., MADV_DONTNEED to clear memory) are not available.
+---
+ linux-user/syscall.c | 22 ++++++++++++++++------
+ 1 file changed, 16 insertions(+), 6 deletions(-)
+
+diff --git a/linux-user/syscall.c b/linux-user/syscall.c
+index 5950222a77..836e39df5f 100644
+--- a/linux-user/syscall.c
++++ b/linux-user/syscall.c
+@@ -11853,12 +11853,22 @@ static abi_long do_syscall1(void *cpu_env, int num, abi_long arg1,
+ 
+ #ifdef TARGET_NR_madvise
+     case TARGET_NR_madvise:
+-        /* A straight passthrough may not be safe because qemu sometimes
+-           turns private file-backed mappings into anonymous mappings.
+-           This will break MADV_DONTNEED.
+-           This is a hint, so ignoring and returning success is ok.  */
+-        return 0;
+-#endif
++#ifdef __linux__
++        /* If the host is Linux, and the guest and host page sizes are the
++         * same, then mmaps will have been passed through one-to-one, so we can
++         * rely on the madvise semantics of the host. Note that the advice
++         * argument (arg3) is fully architecture-independent. */
++        if (TARGET_PAGE_SIZE == sysconf(_SC_PAGESIZE)) {
++            return get_errno(madvise(g2h_untagged(arg1), (size_t)arg2, (int)arg3));
++        } else {
++            return -TARGET_EINVAL;
++        }
++#else   // __linux__
++        /* We will not be able to emulate the Linux-specific semantics, so we
++         * raise an error. */
++        return -TARGET_EINVAL;
++#endif  // !__linux__
++#endif  // TARGET_NR_madvise
+ #ifdef TARGET_NR_fcntl64
+     case TARGET_NR_fcntl64:
+     {
+-- 
+2.34.1
+
diff --git a/crates/runtime/src/instance/allocator/pooling.rs b/crates/runtime/src/instance/allocator/pooling.rs
index 9990e797d819..bab7a47f3f9f 100644
--- a/crates/runtime/src/instance/allocator/pooling.rs
+++ b/crates/runtime/src/instance/allocator/pooling.rs
@@ -1748,11 +1748,6 @@ mod test {
     #[cfg(all(unix, target_pointer_width = "64", feature = "async"))]
     #[test]
     fn test_stack_zeroed() -> Result<()> {
-        // https://github.com/bytecodealliance/wasmtime/pull/2518#issuecomment-747280133
-        if std::env::var("WASMTIME_TEST_NO_HOG_MEMORY").is_ok() {
-            return Ok(());
-        }
-
         let allocator = PoolingInstanceAllocator::new(
             PoolingAllocationStrategy::NextAvailable,
             ModuleLimits {
diff --git a/crates/runtime/src/memfd.rs b/crates/runtime/src/memfd.rs
index cf0da3e0dff1..53ba2490c118 100644
--- a/crates/runtime/src/memfd.rs
+++ b/crates/runtime/src/memfd.rs
@@ -554,10 +554,6 @@ mod test {
 
     #[test]
     fn instantiate_no_image() {
-        if skip_tests_due_to_qemu_madvise_semantics() {
-            return;
-        }
-
         // 4 MiB mmap'd area, not accessible
         let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
         // Create a MemFdSlot on top of it
@@ -590,10 +586,6 @@ mod test {
 
     #[test]
     fn instantiate_image() {
-        if skip_tests_due_to_qemu_madvise_semantics() {
-            return;
-        }
-
         // 4 MiB mmap'd area, not accessible
         let mut mmap = Mmap::accessible_reserved(0, 4 << 20).unwrap();
         // Create a MemFdSlot on top of it
@@ -637,19 +629,4 @@ mod test {
         let slice = mmap.as_slice();
         assert_eq!(&[1, 2, 3, 4], &slice[4096..4100]);
     }
-
-    /// qemu's madvise implementation does not implement the
-    /// "flash-reset back to zero or CoW backing" semantics that Linux
-    /// does. Our CI setup uses qemu (in usermode-binary mode, not
-    /// whole-system mode) to run tests on aarch64 and s390x. We want
-    /// to skip these tests when under qemu, but not when someone is
-    /// developing natively on one of these architectures. So instead,
-    /// we dynamically detect an environment variable that our CI
-    /// setup sets.
-    ///
-    /// See `skip_pooling_allocator_tests()` in `tests/all/main.rs`
-    /// for more.
-    fn skip_tests_due_to_qemu_madvise_semantics() -> bool {
-        std::env::var("WASMTIME_TEST_NO_HOG_MEMORY").is_ok()
-    }
 }