From 64003d5f8a0ab09e48e56b3e7e246fb3875ade20 Mon Sep 17 00:00:00 2001
From: xiaokamikami <fengkehan@bosc.ac.cn>
Date: Tue, 27 Aug 2024 18:50:52 +0800
Subject: [PATCH 01/17] fpga: Add PCIe XDMA framework

Co-authored-by: xushuoxiang <xushx2022@shanghaitech.edu.cn>

---
 Makefile                          |  6 +++
 src/test/csrc/fpga/xdma.cpp       | 63 +++++++++++++++++++++++++++
 src/test/csrc/fpga/xdma.h         | 71 +++++++++++++++++++++++++++++++
 src/test/csrc/fpga/xdma_mpool.cpp |  0
 4 files changed, 140 insertions(+)
 create mode 100644 src/test/csrc/fpga/xdma.cpp
 create mode 100644 src/test/csrc/fpga/xdma.h
 create mode 100644 src/test/csrc/fpga/xdma_mpool.cpp

diff --git a/Makefile b/Makefile
index fe1a0edf6..04bd1054d 100644
--- a/Makefile
+++ b/Makefile
@@ -78,6 +78,12 @@ SIM_VSRC = $(shell find $(VSRC_DIR) -name "*.v" -or -name "*.sv")
 
 # DiffTest support
 DIFFTEST_CSRC_DIR = $(abspath ./src/test/csrc/difftest)
+# FPGA-Difftest support
+FPGA ?= 0
+ifeq ($(FPGA),1)
+DIFFTEST_CSRC_DIR += $(abspath ./src/test/csrc/fpga)
+endif
+
 DIFFTEST_CXXFILES = $(shell find $(DIFFTEST_CSRC_DIR) -name "*.cpp")
 ifeq ($(NO_DIFF), 1)
 SIM_CXXFLAGS += -DCONFIG_NO_DIFFTEST
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
new file mode 100644
index 000000000..f55f58ab5
--- /dev/null
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -0,0 +1,63 @@
+/***************************************************************************************
+* Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC)
+* Copyright (c) 2020-2024 Institute of Computing Technology, Chinese Academy of Sciences
+*
+* DiffTest is licensed under Mulan PSL v2.
+* You can use this software according to the terms and conditions of the Mulan PSL v2.
+* You may obtain a copy of Mulan PSL v2 at:
+*          http://license.coscl.org.cn/MulanPSL2
+*
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+*
+* See the Mulan PSL v2 for more details.
+***************************************************************************************/
+#include "xdma.h"
+
+FpgaXdma::FpgaXdma() {
+  signal(SIGINT, handle_sigint);
+  fd_c2h = open("/dev/xdma0_c2h_0", O_RDWR);
+  set_dma_fd_block();
+}
+
+void FpgaXdma::handle_sigint(int sig) {
+  printf("Unlink sem success, exit success!\n");
+  exit(1);
+}
+
+void FpgaXdma::set_dma_fd_block() {
+  int flags = fcntl(fd, F_GETFL, 0);
+  if (flags == -1) {
+    perror("fcntl get error");
+    return;
+  }
+  // Clear the O NONBLOCK flag and set it to blocking mode
+  flags &= ~O_NONBLOCK;
+  if (fcntl(fd, F_SETFL, flags) == -1) {
+    perror("fcntl set error");
+    return;
+  }
+}
+
+void FpgaXdma::thread_read_xdma() {
+  while (running) {
+    char *memory = memory_pool.get_free_chunk();
+    read(fd_c2h, memory, recv_size);
+    memory_pool.set_busy_chunk();
+  }
+}
+
+void FpgaXdma::write_difftest_thread() {
+  while (running) {
+    const char *memory = memory_pool.get_busy_chunk();
+    memcpy(&diffteststate, memory, sizeof(diffteststate));
+
+    stream_receiver_cout ++;
+    memory_pool.set_free_chunk();
+
+// Notify difftest to run the next beat
+  
+
+  }
+}
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
new file mode 100644
index 000000000..223dda98d
--- /dev/null
+++ b/src/test/csrc/fpga/xdma.h
@@ -0,0 +1,71 @@
+/***************************************************************************************
+* Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC)
+* Copyright (c) 2020-2024 Institute of Computing Technology, Chinese Academy of Sciences
+*
+* DiffTest is licensed under Mulan PSL v2.
+* You can use this software according to the terms and conditions of the Mulan PSL v2.
+* You may obtain a copy of Mulan PSL v2 at:
+*          http://license.coscl.org.cn/MulanPSL2
+*
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+*
+* See the Mulan PSL v2 for more details.
+***************************************************************************************/
+#ifndef __XDMA_H__
+#define __XDMA_H__
+
+#include "common.h"
+#include <queue>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/shm.h>
+#include <vector>
+
+#include "diffstate.h"
+
+#define MAX_DATA_LEN    1024 * 8 - 1
+#define HEAD_DATA_LEN   7
+#define BUFSIZE         1024 * 8 * 8
+#define SHMSZ           27
+#define WAIT_RECV_SLEEP 5
+
+typedef struct FpgaPackgeHead {
+  struct DiffTestState difftestinfo;
+  unsigned int sequence : 16;
+  unsigned int message_size : 16;
+  unsigned long data[HEAD_DATA_LEN];
+} FpgaPackgeHead;
+
+class FpgaXdma {
+public:
+  struct FpgaPackgeHead *shmadd_recv;
+
+  int shmid_recv;
+  int ret_recv;
+  key_t key_recv;
+
+  int fd_c2h;
+  int fd_interrupt;
+
+  struct FpgaPackgeHead recv_buffer;
+  unsigned long buffer[8];
+  unsigned int recv_size = sizeof(FpgaPackgeHead);
+  unsigned long old_exec_instr = 0;
+
+  FpgaXdma();
+  ~FpgaXdma() {};
+
+  void set_dma_fd_block();
+  void handle_sigint(int sig);
+  void read_xdma_thread();
+  void write_difftest_thread();
+
+protected:
+  std::mutex test_mtx;
+  std::condition_variable test_cv;
+};
+
+#endif
diff --git a/src/test/csrc/fpga/xdma_mpool.cpp b/src/test/csrc/fpga/xdma_mpool.cpp
new file mode 100644
index 000000000..e69de29bb

From 5a9d0270d197ad9e41185363935ee28fac6d7b72 Mon Sep 17 00:00:00 2001
From: xiaokamikami <fengkehan@bosc.ac.cn>
Date: Wed, 28 Aug 2024 10:54:16 +0800
Subject: [PATCH 02/17] fpga: add mpool

---
 src/test/csrc/fpga/mpool.cpp      | 72 +++++++++++++++++++++++++++
 src/test/csrc/fpga/mpool.h        | 83 +++++++++++++++++++++++++++++++
 src/test/csrc/fpga/xdma_mpool.cpp |  0
 3 files changed, 155 insertions(+)
 create mode 100644 src/test/csrc/fpga/mpool.cpp
 create mode 100644 src/test/csrc/fpga/mpool.h
 delete mode 100644 src/test/csrc/fpga/xdma_mpool.cpp

diff --git a/src/test/csrc/fpga/mpool.cpp b/src/test/csrc/fpga/mpool.cpp
new file mode 100644
index 000000000..0e6d2122c
--- /dev/null
+++ b/src/test/csrc/fpga/mpool.cpp
@@ -0,0 +1,72 @@
+/***************************************************************************************
+* Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC)
+* Copyright (c) 2020-2024 Institute of Computing Technology, Chinese Academy of Sciences
+*
+* DiffTest is licensed under Mulan PSL v2.
+* You can use this software according to the terms and conditions of the Mulan PSL v2.
+* You may obtain a copy of Mulan PSL v2 at:
+*          http://license.coscl.org.cn/MulanPSL2
+*
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+*
+* See the Mulan PSL v2 for more details.
+***************************************************************************************/
+#include "mpool.h"
+
+void MemoryPool::init_memory_pool() { // Pre-allocate all NUM_BLOCKS aligned blocks
+  memory_pool.reserve(NUM_BLOCKS);
+  for (size_t i = 0; i < NUM_BLOCKS; ++i) {
+    memory_pool.emplace_back();
+    // block_mutexes[i] starts unlocked; unlocking a std::mutex this thread does not own is undefined behavior
+  }
+}
+
+void MemoryPool::cleanup_memory_pool() { // Wake waiters on both condition variables, then drop every block
+  cv_empty.notify_all();
+  cv_filled.notify_all();
+  memory_pool.clear(); // destroys each MemoryBlock; its unique_ptr deleter free()s the buffer
+}
+
+void MemoryPool::unlock_thread() { // Notify every thread waiting on cv_empty or cv_filled
+  cv_empty.notify_all();  // NOTE(review): waiters re-check empty_blocks > 0 and sleep again if it is still false
+  cv_filled.notify_all(); // NOTE(review): same for filled_blocks > 0 — confirm this is enough to release them
+}
+
+char *MemoryPool::get_free_chunk() { // Take the next block in ring order for writing; waits until empty_blocks > 0
+  page_head = (write_index++) & REM_NUM_BLOCKS; // wrap the running write index with the NUM_BLOCKS - 1 mask
+  {
+    std::unique_lock<std::mutex> lock(block_mutexes[page_head]);
+    cv_empty.wait(lock, [this] { return empty_blocks > 0; });
+  } // the scoped lock is released here and the mutex is taken again below
+
+  --empty_blocks;
+  block_mutexes[page_head].lock(); // still held on return; set_busy_chunk() performs the matching unlock
+  return memory_pool[page_head].data.get(); // non-owning pointer to the block's buffer
+}
+
+void MemoryPool::set_busy_chunk() { // Publish the block handed out by get_free_chunk() as filled
+  memory_pool[page_head].is_free = false;
+  ++filled_blocks; // count before unlock/notify so a reader that gets the mutex or is woken sees filled_blocks > 0
+  block_mutexes[page_head].unlock(); // releases the lock taken in get_free_chunk()
+  cv_filled.notify_one();
+}
+
+const char *MemoryPool::get_busy_chunk() { // Take the next block in ring order for reading; waits until filled_blocks > 0
+  page_end = (read_index++) & REM_NUM_BLOCKS; // wrap the running read index with the NUM_BLOCKS - 1 mask
+  {
+    std::unique_lock<std::mutex> lock(block_mutexes[page_end]);
+    cv_filled.wait(lock, [this] { return filled_blocks > 0; });
+  } // the scoped lock is released here and the mutex is taken again below
+  --filled_blocks;
+  block_mutexes[page_end].lock(); // still held on return; set_free_chunk() performs the matching unlock
+  return memory_pool[page_end].data.get(); // non-owning, read-only view of the block's buffer
+}
+
+void MemoryPool::set_free_chunk() { // Return the block handed out by get_busy_chunk() to the free set
+  memory_pool[page_end].is_free = true;
+  ++empty_blocks; // count before unlock/notify so a writer that gets the mutex or is woken sees empty_blocks > 0
+  block_mutexes[page_end].unlock(); // releases the lock taken in get_busy_chunk()
+  cv_empty.notify_one();
+}
diff --git a/src/test/csrc/fpga/mpool.h b/src/test/csrc/fpga/mpool.h
new file mode 100644
index 000000000..7c3242588
--- /dev/null
+++ b/src/test/csrc/fpga/mpool.h
@@ -0,0 +1,83 @@
+/***************************************************************************************
+* Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC)
+* Copyright (c) 2020-2024 Institute of Computing Technology, Chinese Academy of Sciences
+*
+* DiffTest is licensed under Mulan PSL v2.
+* You can use this software according to the terms and conditions of the Mulan PSL v2.
+* You may obtain a copy of Mulan PSL v2 at:
+*          http://license.coscl.org.cn/MulanPSL2
+*
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+*
+* See the Mulan PSL v2 for more details.
+***************************************************************************************/
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#define MEMPOOL_SIZE   4096 * 1024 // 4M page
+#define MEMBLOCK_SIZE  4096        // 4K packge
+#define NUM_BLOCKS     (MEMPOOL_SIZE / MEMBLOCK_SIZE)
+#define REM_NUM_BLOCKS (NUM_BLOCKS - 1)
+
+extern bool running;
+class MemoryPool {
+public:
+  // Constructor to allocate aligned memory blocks
+  MemoryPool() {
+    init_memory_pool();
+  }
+
+  ~MemoryPool() {
+    cleanup_memory_pool();
+  }
+  // Disable copy constructors and copy assignment operators
+  MemoryPool(const MemoryPool &) = delete;
+  MemoryPool &operator=(const MemoryPool &) = delete;
+
+  void init_memory_pool();
+
+  // Notify all waiters and release every block
+  void cleanup_memory_pool();
+  // Notify all threads waiting on either condition variable
+  void unlock_thread();
+
+  // Wait for a free block, lock it and return its buffer for writing
+  char *get_free_chunk();
+  // Mark the block from get_free_chunk() as filled and unlock it
+  void set_busy_chunk();
+
+  // Wait for a filled block, lock it and return its buffer for reading
+  const char *get_busy_chunk();
+  // Mark the block from get_busy_chunk() as free and unlock it
+  void set_free_chunk();
+
+private:
+  struct MemoryBlock {
+    std::unique_ptr<char, std::function<void(char *)>> data;
+    bool is_free;
+
+    MemoryBlock() : is_free(true) {
+      void *ptr = nullptr;
+      if (posix_memalign(&ptr, MEMBLOCK_SIZE, MEMBLOCK_SIZE * 2) != 0) {
+        throw std::runtime_error("Failed to allocate aligned memory");
+      }
+      data = std::unique_ptr<char, std::function<void(char *)>>(static_cast<char *>(ptr), [](char *p) { free(p); });
+    }
+  };
+  std::vector<MemoryBlock> memory_pool;              // Mempool
+  std::vector<std::mutex> block_mutexes{NUM_BLOCKS}; // Partition lock array
+  std::atomic<size_t> empty_blocks = NUM_BLOCKS;     // Free block count
+  std::atomic<size_t> filled_blocks{0};              // Filled blocks count (explicit 0: a default-constructed atomic is uninitialized before C++20)
+  std::atomic<size_t> write_index{0};                // Running count of blocks handed out for writing
+  std::atomic<size_t> read_index{0};                 // Running count of blocks handed out for reading
+  std::condition_variable cv_empty;  // Free block condition variable
+  std::condition_variable cv_filled; // Filled block condition variable
+  size_t page_head = 0;
+  size_t page_end = 0;
+};
diff --git a/src/test/csrc/fpga/xdma_mpool.cpp b/src/test/csrc/fpga/xdma_mpool.cpp
deleted file mode 100644
index e69de29bb..000000000

From 4b118ef50be7f61e4184de6c22b5eeb761bff974 Mon Sep 17 00:00:00 2001
From: xiaokamikami <fengkehan@bosc.ac.cn>
Date: Wed, 28 Aug 2024 15:53:21 +0800
Subject: [PATCH 03/17] difftest: Move mempool to common code

---
 src/test/csrc/{fpga => common}/mpool.cpp | 0
 src/test/csrc/{fpga => common}/mpool.h   | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename src/test/csrc/{fpga => common}/mpool.cpp (100%)
 rename src/test/csrc/{fpga => common}/mpool.h (100%)

diff --git a/src/test/csrc/fpga/mpool.cpp b/src/test/csrc/common/mpool.cpp
similarity index 100%
rename from src/test/csrc/fpga/mpool.cpp
rename to src/test/csrc/common/mpool.cpp
diff --git a/src/test/csrc/fpga/mpool.h b/src/test/csrc/common/mpool.h
similarity index 100%
rename from src/test/csrc/fpga/mpool.h
rename to src/test/csrc/common/mpool.h

From 740232ddec41436623240f70b577c8e79a0f5dad Mon Sep 17 00:00:00 2001
From: xiaokamikami <fengkehan@bosc.ac.cn>
Date: Thu, 29 Aug 2024 11:18:48 +0800
Subject: [PATCH 04/17] fpga: Add function of difftest through dma interface

---
 src/test/csrc/common/mpool.h        |   5 ++
 src/test/csrc/difftest/difftest.cpp |   2 +
 src/test/csrc/fpga/fpga_main.cpp    | 103 ++++++++++++++++++++++++++++
 src/test/csrc/fpga/xdma.cpp         |  42 ++++++++----
 src/test/csrc/fpga/xdma.h           |  31 +++++----
 5 files changed, 154 insertions(+), 29 deletions(-)
 create mode 100644 src/test/csrc/fpga/fpga_main.cpp

diff --git a/src/test/csrc/common/mpool.h b/src/test/csrc/common/mpool.h
index 7c3242588..2aafdea48 100644
--- a/src/test/csrc/common/mpool.h
+++ b/src/test/csrc/common/mpool.h
@@ -13,6 +13,9 @@
 *
 * See the Mulan PSL v2 for more details.
 ***************************************************************************************/
+#ifndef __MPOOL_H__
+#define __MPOOL_H__
+
 #include <atomic>
 #include <condition_variable>
 #include <functional>
@@ -81,3 +84,5 @@ class MemoryPool {
   size_t page_head = 0;
   size_t page_end = 0;
 };
+
+#endif
diff --git a/src/test/csrc/difftest/difftest.cpp b/src/test/csrc/difftest/difftest.cpp
index 095c1fdb5..936c8e991 100644
--- a/src/test/csrc/difftest/difftest.cpp
+++ b/src/test/csrc/difftest/difftest.cpp
@@ -92,7 +92,9 @@ void difftest_set_dut() {
   }
 }
 int difftest_step() {
+#ifndef WITH_FPGA
   difftest_set_dut();
+#endif
   for (int i = 0; i < NUM_CORES; i++) {
     int ret = difftest[i]->step();
     if (ret) {
diff --git a/src/test/csrc/fpga/fpga_main.cpp b/src/test/csrc/fpga/fpga_main.cpp
new file mode 100644
index 000000000..864590915
--- /dev/null
+++ b/src/test/csrc/fpga/fpga_main.cpp
@@ -0,0 +1,103 @@
+/***************************************************************************************
+* Copyright (c) 2024 Beijing Institute of Open Source Chip (BOSC)
+* Copyright (c) 2020-2024 Institute of Computing Technology, Chinese Academy of Sciences
+*
+* DiffTest is licensed under Mulan PSL v2.
+* You can use this software according to the terms and conditions of the Mulan PSL v2.
+* You may obtain a copy of Mulan PSL v2 at:
+*          http://license.coscl.org.cn/MulanPSL2
+*
+* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+*
+* See the Mulan PSL v2 for more details.
+***************************************************************************************/
+
+#include "difftest.h"
+#include "diffstate.h"
+#include "mpool.h"
+#include "xdma.h"
+
+#define XDMA_C2H_DEVICE "/dev/xdma0_c2h_0"
+
+enum {
+  SIMV_RUN,
+  SIMV_DONE,
+  SIMV_FAIL,
+} simv_state;
+
+static uint8_t simv_result = SIMV_RUN;
+static uint64_t max_instrs = 0;
+
+struct core_end_info_t { // Per-core end-of-run bookkeeping filled in by cpu_endtime_check()
+  bool core_trap[NUM_CORES];  // set once core i has exceeded max_instrs
+  double core_cpi[NUM_CORES]; // cycleCnt / instrCnt recorded when core i was flagged
+  uint8_t core_trap_num;      // number of cores flagged so far
+};
+static core_end_info_t core_end_info;
+
+void simv_init();
+void simv_step();
+void cpu_endtime_check();
+void set_dut_from_xdma();
+
+FpgaXdma *xdma_device = NULL; 
+
+int main(int argc, char *argv[]) {
+
+  simv_init();
+
+  while (simv_result == SIMV_RUN) {
+    // get xdma data
+    set_dut_from_xdma();
+
+    // run difftest
+    simv_step();
+    cpu_endtime_check();
+  }
+}
+
+void set_dut_from_xdma() {
+  { // Wait for a filled package, point every core's DUT state at it, then release the writer
+    std::unique_lock<std::mutex> lock(xdma_device->diff_mtx);
+    xdma_device->diff_filled_cv.wait(lock, [] { return xdma_device->diff_packge_filled; });
+    for (int i = 0; i < NUM_CORES; i++) {
+
+      difftest[i]->dut = &xdma_device->difftest_pack[i]; // aliases the shared buffer; no copy is made
+    }
+    xdma_device->diff_packge_filled = false; // NOTE(review): writer may now memcpy into difftest_pack while dut still points at it — confirm
+    xdma_device->diff_empile_cv.notify_one();
+  }
+}
+
+void simv_init() {
+  xdma_device = new FpgaXdma(XDMA_C2H_DEVICE);
+  difftest_init();
+  max_instrs = 40000000;
+}
+
+void simv_step() { // Run one difftest step; a non-zero return marks the run as failed
+  if (difftest_step())
+    simv_result = SIMV_FAIL;
+}
+
+void cpu_endtime_check() { // Flag cores whose instrCnt passed max_instrs; end the run once all cores are flagged
+  if (max_instrs != 0) { // 0 for no limit
+    for (int i = 0; i < NUM_CORES; i++) {
+      if (core_end_info.core_trap[i]) // already flagged by an earlier call
+        continue;
+      auto trap = difftest[i]->get_trap_event();
+      if (max_instrs < trap->instrCnt) {
+        core_end_info.core_trap[i] = true;
+        core_end_info.core_trap_num++;
+        eprintf(ANSI_COLOR_GREEN "EXCEEDED CORE-%d MAX INSTR: %ld\n" ANSI_COLOR_RESET, i, max_instrs);
+        difftest[i]->display_stats();
+        core_end_info.core_cpi[i] = (double)trap->cycleCnt / (double)trap->instrCnt; // cycles per instruction at the cut-off
+        if (core_end_info.core_trap_num == NUM_CORES) { // every core has been flagged
+          simv_result = SIMV_DONE;
+        }
+      }
+    }
+  }
+}
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index f55f58ab5..589bb32c7 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -13,11 +13,15 @@
 *
 * See the Mulan PSL v2 for more details.
 ***************************************************************************************/
+#include <fcntl.h>
+#include <signal.h>
+
 #include "xdma.h"
+#include "mpool.h"
 
-FpgaXdma::FpgaXdma() {
+FpgaXdma::FpgaXdma(const char *device_name) {
   signal(SIGINT, handle_sigint);
-  fd_c2h = open("/dev/xdma0_c2h_0", O_RDWR);
+  fd_c2h = open(device_name, O_RDWR);
   set_dma_fd_block();
 }
 
@@ -27,37 +31,47 @@ void FpgaXdma::handle_sigint(int sig) {
 }
 
 void FpgaXdma::set_dma_fd_block() {
-  int flags = fcntl(fd, F_GETFL, 0);
+  int flags = fcntl(fd_c2h, F_GETFL, 0);
   if (flags == -1) {
     perror("fcntl get error");
     return;
   }
   // Clear the O NONBLOCK flag and set it to blocking mode
   flags &= ~O_NONBLOCK;
-  if (fcntl(fd, F_SETFL, flags) == -1) {
+  if (fcntl(fd_c2h, F_SETFL, flags) == -1) {
     perror("fcntl set error");
     return;
   }
 }
 
-void FpgaXdma::thread_read_xdma() {
+void FpgaXdma::read_xdma_thread() {
   while (running) {
-    char *memory = memory_pool.get_free_chunk();
+    char *memory = xdma_mempool.get_free_chunk();
     read(fd_c2h, memory, recv_size);
-    memory_pool.set_busy_chunk();
+    xdma_mempool.set_busy_chunk();
   }
 }
 
 void FpgaXdma::write_difftest_thread() {
   while (running) {
-    const char *memory = memory_pool.get_busy_chunk();
-    memcpy(&diffteststate, memory, sizeof(diffteststate));
-
-    stream_receiver_cout ++;
-    memory_pool.set_free_chunk();
+    const char *memory = xdma_mempool.get_busy_chunk();
+    static uint8_t valid_core = 0;
+    uint8_t core_id = 0;
 
-// Notify difftest to run the next beat
-  
+    memcpy(&core_id, memory + sizeof(DiffTestState), sizeof(uint8_t)); // core id byte sits right after the DiffTestState payload
+    assert(core_id < NUM_CORES); // core_id indexes difftest_pack[NUM_CORES] below; the old `>` test rejected every valid id
+    {
+      std::unique_lock<std::mutex> lock(diff_mtx);
+      diff_empile_cv.wait(lock, [this] { return !diff_packge_filled; });
+      memcpy(&difftest_pack[core_id], memory, sizeof(DiffTestState));
+    }
+    valid_core ++;
+    xdma_mempool.set_free_chunk();
 
+    if (core_id == NUM_CORES) {
+      diff_packge_filled = true;
+      // Notify difftest to run the next check
+      diff_filled_cv.notify_one();
+    }
   }
 }
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index 223dda98d..41112ad6f 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -16,7 +16,6 @@
 #ifndef __XDMA_H__
 #define __XDMA_H__
 
-#include "common.h"
 #include <queue>
 #include <stdbool.h>
 #include <stdio.h>
@@ -24,25 +23,25 @@
 #include <sys/shm.h>
 #include <vector>
 
+#include "common.h"
 #include "diffstate.h"
+#include "mpool.h"
 
-#define MAX_DATA_LEN    1024 * 8 - 1
+#define WITH_FPGA
 #define HEAD_DATA_LEN   7
 #define BUFSIZE         1024 * 8 * 8
-#define SHMSZ           27
 #define WAIT_RECV_SLEEP 5
 
 typedef struct FpgaPackgeHead {
-  struct DiffTestState difftestinfo;
-  unsigned int sequence : 16;
-  unsigned int message_size : 16;
-  unsigned long data[HEAD_DATA_LEN];
+  DiffTestState difftestinfo;
+  uint8_t corid;
 } FpgaPackgeHead;
 
 class FpgaXdma {
 public:
   struct FpgaPackgeHead *shmadd_recv;
-
+  MemoryPool xdma_mempool;
+  DiffTestState difftest_pack[NUM_CORES] = {};
   int shmid_recv;
   int ret_recv;
   key_t key_recv;
@@ -50,22 +49,24 @@ class FpgaXdma {
   int fd_c2h;
   int fd_interrupt;
 
-  struct FpgaPackgeHead recv_buffer;
-  unsigned long buffer[8];
   unsigned int recv_size = sizeof(FpgaPackgeHead);
   unsigned long old_exec_instr = 0;
 
-  FpgaXdma();
+  std::condition_variable diff_filled_cv;
+  std::condition_variable diff_empile_cv;
+  std::mutex diff_mtx;
+  bool diff_packge_filled = false;
+  FpgaXdma(const char *device_name);
   ~FpgaXdma() {};
 
   void set_dma_fd_block();
-  void handle_sigint(int sig);
+
+  // thread api
   void read_xdma_thread();
   void write_difftest_thread();
 
-protected:
-  std::mutex test_mtx;
-  std::condition_variable test_cv;
+private:
+  static void handle_sigint(int sig);
 };
 
 #endif

From 120d708726888bcb2d5bb35f1674573e0eb8c02b Mon Sep 17 00:00:00 2001
From: xiaokamikami <fengkehan@bosc.ac.cn>
Date: Thu, 29 Aug 2024 15:24:00 +0800
Subject: [PATCH 05/17] fpga: add independent compilation and usage support
 under fpga

---
 Makefile                         |  1 +
 fpga.mk                          | 19 +++++++++++++++++
 src/test/csrc/common/mpool.h     |  1 -
 src/test/csrc/fpga/fpga_main.cpp | 26 +++++++++++++++++++++---
 src/test/csrc/fpga/xdma.cpp      | 35 +++++++++++++++++++++++++++-----
 src/test/csrc/fpga/xdma.h        | 22 ++++++++++++--------
 6 files changed, 86 insertions(+), 18 deletions(-)
 create mode 100644 fpga.mk

diff --git a/Makefile b/Makefile
index 04bd1054d..f4905d173 100644
--- a/Makefile
+++ b/Makefile
@@ -231,6 +231,7 @@ include verilator.mk
 include vcs.mk
 include palladium.mk
 include libso.mk
+include fpga.mk
 
 clean: vcs-clean pldm-clean
 	rm -rf $(BUILD_DIR)
diff --git a/fpga.mk b/fpga.mk
new file mode 100644
index 000000000..e28792301
--- /dev/null
+++ b/fpga.mk
@@ -0,0 +1,19 @@
+
+FPGA           = FPGA_HOST
+FPGA_TARGET    = $(abspath $(BUILD_DIR)/simv)
+FPGA_BUILD_DIR = $(abspath $(BUILD_DIR)/simv-compile)
+FPGA_RUN_DIR   = $(abspath $(BUILD_DIR)/$(notdir $(RUN_BIN)))
+
+FPGA_CSRC_DIR   = $(abspath ./src/test/csrc/fpga)
+FPGA_CONFIG_DIR = $(abspath ./config)
+
+FPGA_CXXFILES  = $(SIM_CXXFILES) $(shell find $(FPGA_CSRC_DIR) -name "*.cpp")
+FPGA_CXXFLAGS  = $(subst \\\",\", $(SIM_CXXFLAGS)) -I$(FPGA_CSRC_DIR) -DNUM_CORES=$(NUM_CORES)
+FPGA_LDFLAGS   = $(SIM_LDFLAGS) -lpthread -ldl
+
+fpga-build: fpga-clean fpga-host
+
+fpga-host:
+	$(CXX) $(FPGA_CXXFLAGS) $(FPGA_CXXFILES) $^ -o $@ $(FPGA_LDFLAGS)
+fpga-clean:
+	rm -f fpga-host
diff --git a/src/test/csrc/common/mpool.h b/src/test/csrc/common/mpool.h
index 2aafdea48..0925e3043 100644
--- a/src/test/csrc/common/mpool.h
+++ b/src/test/csrc/common/mpool.h
@@ -28,7 +28,6 @@
 #define NUM_BLOCKS     (MEMPOOL_SIZE / MEMBLOCK_SIZE)
 #define REM_NUM_BLOCKS (NUM_BLOCKS - 1)
 
-extern bool running;
 class MemoryPool {
 public:
   // Constructor to allocate aligned memory blocks
diff --git a/src/test/csrc/fpga/fpga_main.cpp b/src/test/csrc/fpga/fpga_main.cpp
index 864590915..3f51d21a5 100644
--- a/src/test/csrc/fpga/fpga_main.cpp
+++ b/src/test/csrc/fpga/fpga_main.cpp
@@ -14,9 +14,10 @@
 * See the Mulan PSL v2 for more details.
 ***************************************************************************************/
 
-#include "difftest.h"
 #include "diffstate.h"
+#include "difftest.h"
 #include "mpool.h"
+#include "refproxy.h"
 #include "xdma.h"
 
 #define XDMA_C2H_DEVICE "/dev/xdma0_c2h_0"
@@ -41,11 +42,13 @@ void simv_init();
 void simv_step();
 void cpu_endtime_check();
 void set_dut_from_xdma();
+void set_diff_ref_so(char *s);
+void args_parsingniton(int argc, char *argv[]);
 
-FpgaXdma *xdma_device = NULL; 
+FpgaXdma *xdma_device = NULL;
 
 int main(int argc, char *argv[]) {
-
+  args_parsingniton(argc, argv);
   simv_init();
 
   while (simv_result == SIMV_RUN) {
@@ -56,6 +59,15 @@ int main(int argc, char *argv[]) {
     simv_step();
     cpu_endtime_check();
   }
+  delete xdma_device; // created with `new` in simv_init(); free() is undefined here and skips ~FpgaXdma()
+}
+
+void set_diff_ref_so(char *s) { // Store a private heap copy of the reference .so path in difftest_ref_so
+  extern const char *difftest_ref_so;
+  printf("diff-test ref so:%s\n", s);
+  char *buf = (char *)malloc(strlen(s) + 1); // sized to the argument: a fixed 256-byte buffer overflowed on long paths
+  strcpy(buf, s);
+  difftest_ref_so = buf; // intentionally never freed: the pointer is kept in the global
 }
 
 void set_dut_from_xdma() {
@@ -101,3 +113,11 @@ void cpu_endtime_check() {
     }
   }
 }
+
+void args_parsingniton(int argc, char *argv[]) { // Scan the command line; only `--diff <ref.so>` is recognised
+  for (int i = 1; i < argc; ++i) {
+    if (strcmp(argv[i], "--diff") == 0 && i + 1 < argc) { // a trailing `--diff` has no value: argv[argc] is NULL
+      set_diff_ref_so(argv[++i]);
+    }
+  }
+}
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index 589bb32c7..f586f834a 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -13,15 +13,19 @@
 *
 * See the Mulan PSL v2 for more details.
 ***************************************************************************************/
-#include <fcntl.h>
-#include <signal.h>
-
 #include "xdma.h"
 #include "mpool.h"
+#include <fcntl.h>
+#include <signal.h>
 
 FpgaXdma::FpgaXdma(const char *device_name) {
   signal(SIGINT, handle_sigint);
   fd_c2h = open(device_name, O_RDWR);
+  if (fd_c2h == -1) {
+    printf("xdma device not find %s\n", device_name);
+    exit(1);
+  }
+  printf("xdma device %s\n", device_name);
   set_dma_fd_block();
 }
 
@@ -34,6 +38,7 @@ void FpgaXdma::set_dma_fd_block() {
   int flags = fcntl(fd_c2h, F_GETFL, 0);
   if (flags == -1) {
     perror("fcntl get error");
+    exit(1);
     return;
   }
   // Clear the O NONBLOCK flag and set it to blocking mode
@@ -44,6 +49,25 @@ void FpgaXdma::set_dma_fd_block() {
   }
 }
 
+void FpgaXdma::start_transmit_thread() {
+  if (running == true)
+    return;
+  receive_thread = std::thread(&FpgaXdma::read_xdma_thread, this);
+  process_thread = std::thread(&FpgaXdma::write_difftest_thread, this);
+  running = true;
+}
+
+void FpgaXdma::stop_thansmit_thread() { // Ask both worker threads to stop and join them; no-op when not running
+  if (running == false)
+    return;
+  // Clear the flag before waking and joining: the workers loop on `running`, so joining first could never return
+  running = false;
+  xdma_mempool.unlock_thread(); // NOTE(review): a worker blocked in read() or in a pool wait may still not wake — confirm
+  if (receive_thread.joinable()) receive_thread.join();
+  if (process_thread.joinable()) process_thread.join();
+  running = false;
+}
+
 void FpgaXdma::read_xdma_thread() {
   while (running) {
     char *memory = xdma_mempool.get_free_chunk();
@@ -65,11 +89,12 @@ void FpgaXdma::write_difftest_thread() {
       diff_empile_cv.wait(lock, [this] { return !diff_packge_filled; });
       memcpy(&difftest_pack[core_id], memory, sizeof(DiffTestState));
     }
-    valid_core ++;
+    valid_core++;
     xdma_mempool.set_free_chunk();
 
-    if (core_id == NUM_CORES) {
+    if (valid_core == NUM_CORES) {
       diff_packge_filled = true;
+      valid_core = 0;
       // Notify difftest to run the next check
       diff_filled_cv.notify_one();
     }
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index 41112ad6f..c5d5d2c15 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -16,22 +16,18 @@
 #ifndef __XDMA_H__
 #define __XDMA_H__
 
+#include "common.h"
+#include "diffstate.h"
+#include "mpool.h"
 #include <queue>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/shm.h>
+#include <thread>
 #include <vector>
 
-#include "common.h"
-#include "diffstate.h"
-#include "mpool.h"
-
 #define WITH_FPGA
-#define HEAD_DATA_LEN   7
-#define BUFSIZE         1024 * 8 * 8
-#define WAIT_RECV_SLEEP 5
-
 typedef struct FpgaPackgeHead {
   DiffTestState difftestinfo;
   uint8_t corid;
@@ -48,6 +44,7 @@ class FpgaXdma {
 
   int fd_c2h;
   int fd_interrupt;
+  bool running = false;
 
   unsigned int recv_size = sizeof(FpgaPackgeHead);
   unsigned long old_exec_instr = 0;
@@ -57,15 +54,22 @@ class FpgaXdma {
   std::mutex diff_mtx;
   bool diff_packge_filled = false;
   FpgaXdma(const char *device_name);
-  ~FpgaXdma() {};
+  ~FpgaXdma() {
+    stop_thansmit_thread();
+  };
 
   void set_dma_fd_block();
 
   // thread api
+  void start_transmit_thread();
+  void stop_thansmit_thread();
   void read_xdma_thread();
   void write_difftest_thread();
 
 private:
+  std::thread receive_thread;
+  std::thread process_thread;
+
   static void handle_sigint(int sig);
 };
 

From 10f3427ef4ba08e5622c1122bf84401c5dcad611 Mon Sep 17 00:00:00 2001
From: xiaokamikami <fengkehan@bosc.ac.cn>
Date: Wed, 18 Sep 2024 18:13:54 +0800
Subject: [PATCH 06/17] fpga: modify the xdma init process

---
 fpga.mk                          |  3 ++
 src/test/csrc/fpga/fpga_main.cpp |  4 +--
 src/test/csrc/fpga/xdma.cpp      | 55 +++++++++++++++++---------------
 src/test/csrc/fpga/xdma.h        | 13 +++++---
 4 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/fpga.mk b/fpga.mk
index e28792301..11e08ceb9 100644
--- a/fpga.mk
+++ b/fpga.mk
@@ -11,6 +11,9 @@ FPGA_CXXFILES  = $(SIM_CXXFILES) $(shell find $(FPGA_CSRC_DIR) -name "*.cpp")
 FPGA_CXXFLAGS  = $(subst \\\",\", $(SIM_CXXFLAGS)) -I$(FPGA_CSRC_DIR) -DNUM_CORES=$(NUM_CORES)
 FPGA_LDFLAGS   = $(SIM_LDFLAGS) -lpthread -ldl
 
+DMA_CHANNELS?=1
+FPGA_CXXFLAGS += -DCONFIG_DMA_CHANNELS=$(DMA_CHANNELS) # preprocessor define: belongs in compile flags, not linker flags
+
 fpga-build: fpga-clean fpga-host
 
 fpga-host:
diff --git a/src/test/csrc/fpga/fpga_main.cpp b/src/test/csrc/fpga/fpga_main.cpp
index 3f51d21a5..274325c15 100644
--- a/src/test/csrc/fpga/fpga_main.cpp
+++ b/src/test/csrc/fpga/fpga_main.cpp
@@ -20,8 +20,6 @@
 #include "refproxy.h"
 #include "xdma.h"
 
-#define XDMA_C2H_DEVICE "/dev/xdma0_c2h_0"
-
 enum {
   SIMV_RUN,
   SIMV_DONE,
@@ -84,7 +82,7 @@ void set_dut_from_xdma() {
 }
 
 void simv_init() {
-  xdma_device = new FpgaXdma(XDMA_C2H_DEVICE);
+  xdma_device = new FpgaXdma;
   difftest_init();
   max_instrs = 40000000;
 }
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index f586f834a..985934f58 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -18,15 +18,31 @@
 #include <fcntl.h>
 #include <signal.h>
 
-FpgaXdma::FpgaXdma(const char *device_name) {
+#define XDMA_C2H_DEVICE "/dev/xdma0_c2h_"
+#define XDMA_H2C_DEVICE "/dev/xdma0_h2c_0"
+static const int dma_channel = CONFIG_DMA_CHANNELS;
+
+FpgaXdma::FpgaXdma() { // Open one C2H device per DMA channel plus the H2C device; exit the process on any failure
   signal(SIGINT, handle_sigint);
-  fd_c2h = open(device_name, O_RDWR);
-  if (fd_c2h == -1) {
-    printf("xdma device not find %s\n", device_name);
-    exit(1);
+  for (int channel = 0; channel < dma_channel; channel++) {
+    char c2h_device[64];
+    snprintf(c2h_device, sizeof(c2h_device), "%s%d", XDMA_C2H_DEVICE, channel); // e.g. /dev/xdma0_c2h_0
+    xdma_c2h_fd[channel] = open(c2h_device, O_RDONLY);
+    if (xdma_c2h_fd[channel] == -1) {
+      perror("Failed to open XDMA device"); // report first, before stream I/O can overwrite errno
+      std::cout << c2h_device << std::endl;
+      exit(-1);
+    }
+    std::cout << "XDMA link " << c2h_device << std::endl;
+  }
+
+  xdma_h2c_fd = open(XDMA_H2C_DEVICE, O_WRONLY); // xdma_h2c_fd is a single descriptor, not an array
+  if (xdma_h2c_fd == -1) {
+    perror("Failed to open XDMA device");
+    std::cout << XDMA_H2C_DEVICE << std::endl;
+    exit(-1);
   }
-  printf("xdma device %s\n", device_name);
-  set_dma_fd_block();
+  std::cout << "XDMA link " << XDMA_H2C_DEVICE << std::endl;
 }
 
 void FpgaXdma::handle_sigint(int sig) {
@@ -34,26 +50,15 @@ void FpgaXdma::handle_sigint(int sig) {
   exit(1);
 }
 
-void FpgaXdma::set_dma_fd_block() {
-  int flags = fcntl(fd_c2h, F_GETFL, 0);
-  if (flags == -1) {
-    perror("fcntl get error");
-    exit(1);
-    return;
-  }
-  // Clear the O NONBLOCK flag and set it to blocking mode
-  flags &= ~O_NONBLOCK;
-  if (fcntl(fd_c2h, F_SETFL, flags) == -1) {
-    perror("fcntl set error");
-    return;
-  }
-}
-
 void FpgaXdma::start_transmit_thread() {
   if (running == true)
     return;
-  receive_thread = std::thread(&FpgaXdma::read_xdma_thread, this);
-  process_thread = std::thread(&FpgaXdma::write_difftest_thread, this);
+  running = true; // set before spawning: the worker loops test `running` as soon as they start
+  for (int i = 0; i < dma_channel; i++) { // one reader thread per DMA channel
+    printf("start channel %d \n", i);
+    receive_thread[i] = std::thread(&FpgaXdma::read_xdma_thread, this, i);
+  }
+  process_thread[0] = std::thread(&FpgaXdma::write_difftest_thread, this); // takes no channel argument; `i` is out of scope here
   running = true;
 }
 
@@ -68,7 +73,7 @@ void FpgaXdma::stop_thansmit_thread() {
   running = false;
 }
 
-void FpgaXdma::read_xdma_thread() {
+void FpgaXdma::read_xdma_thread(int channel) {
   while (running) {
     char *memory = xdma_mempool.get_free_chunk();
     read(fd_c2h, memory, recv_size);
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index c5d5d2c15..d5572bea1 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -36,13 +36,16 @@ typedef struct FpgaPackgeHead {
 class FpgaXdma {
 public:
   struct FpgaPackgeHead *shmadd_recv;
-  MemoryPool xdma_mempool;
+
+  MemoryPool xdma_mempool[DMA_CHANNS];
   DiffTestState difftest_pack[NUM_CORES] = {};
   int shmid_recv;
   int ret_recv;
   key_t key_recv;
 
-  int fd_c2h;
+  int xdma_c2h_fd[DMA_CHANNS];
+  int xdma_h2c_fd;
+
   int fd_interrupt;
   bool running = false;
 
@@ -53,7 +56,7 @@ class FpgaXdma {
   std::condition_variable diff_empile_cv;
   std::mutex diff_mtx;
   bool diff_packge_filled = false;
-  FpgaXdma(const char *device_name);
+  FpgaXdma();
   ~FpgaXdma() {
     stop_thansmit_thread();
   };
@@ -67,8 +70,8 @@ class FpgaXdma {
   void write_difftest_thread();
 
 private:
-  std::thread receive_thread;
-  std::thread process_thread;
+  std::thread receive_thread[DMA_CHANNS];
+  std::thread process_thread[DMA_CHANNS];
 
   static void handle_sigint(int sig);
 };

From 308b05648b1a4dc595d765cf7b823e8592fb5530 Mon Sep 17 00:00:00 2001
From: xiaokamikami <fengkehan@bosc.ac.cn>
Date: Fri, 20 Sep 2024 15:34:51 +0800
Subject: [PATCH 07/17] difftest: Fixed an issue where the block structure was
 not memory safe

---
 src/test/csrc/common/mpool.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/test/csrc/common/mpool.h b/src/test/csrc/common/mpool.h
index 0925e3043..0484d6348 100644
--- a/src/test/csrc/common/mpool.h
+++ b/src/test/csrc/common/mpool.h
@@ -22,12 +22,37 @@
 #include <memory>
 #include <mutex>
 #include <vector>
+#include <cstring>
 
 #define MEMPOOL_SIZE   4096 * 1024 // 4M page
 #define MEMBLOCK_SIZE  4096        // 4K packge
 #define NUM_BLOCKS     (MEMPOOL_SIZE / MEMBLOCK_SIZE)
 #define REM_NUM_BLOCKS (NUM_BLOCKS - 1)
 
+struct MemoryBlock {
+  std::unique_ptr<char[], std::function<void(char*)>> data;
+  std::atomic<bool> is_free;
+
+  MemoryBlock() : is_free(true) {
+    void* ptr = nullptr;
+    if (posix_memalign(&ptr, 4096, 4096) != 0) {
+      throw std::runtime_error("Failed to allocate aligned memory");
+    }
+    memset(ptr, 0, 4096);
+    data = std::unique_ptr<char[], std::function<void(char*)>>(
+      static_cast<char*>(ptr),
+      [](char* p) { free(p); }
+    );
+  }
+  // Disable copy operations
+  MemoryBlock(const MemoryBlock&) = delete;
+  MemoryBlock& operator=(const MemoryBlock&) = delete;
+
+  // Enable move operations
+  MemoryBlock(MemoryBlock&&) = default;
+  MemoryBlock& operator=(MemoryBlock&&) = default;
+};
+
 class MemoryPool {
 public:
   // Constructor to allocate aligned memory blocks
@@ -74,7 +99,7 @@ class MemoryPool {
   };
   std::vector<MemoryBlock> memory_pool;              // Mempool
   std::vector<std::mutex> block_mutexes{NUM_BLOCKS}; // Partition lock array
-  std::atomic<size_t> empty_blocks = NUM_BLOCKS;     // Free block count
+  std::atomic<size_t> empty_blocks {NUM_BLOCKS};     // Free block count
   std::atomic<size_t> filled_blocks;                 // Filled blocks count
   std::atomic<size_t> write_index;
   std::atomic<size_t> read_index;
@@ -84,4 +109,5 @@ class MemoryPool {
   size_t page_end = 0;
 };
 
+
 #endif

From 818fa62ad4f605ebedd5cfd91e70408b28f77ad3 Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Mon, 23 Sep 2024 16:49:03 +0800
Subject: [PATCH 08/17] fpga: Add a sliding-window memory pool for multi-channel
 out-of-order reception of data packets

---
 src/test/csrc/common/mpool.cpp | 94 ++++++++++++++++++++++++++++++++++
 src/test/csrc/common/mpool.h   | 59 +++++++++++++++++++++
 2 files changed, 153 insertions(+)

diff --git a/src/test/csrc/common/mpool.cpp b/src/test/csrc/common/mpool.cpp
index 0e6d2122c..05e5caa30 100644
--- a/src/test/csrc/common/mpool.cpp
+++ b/src/test/csrc/common/mpool.cpp
@@ -70,3 +70,97 @@ void MemoryPool::set_free_chunk() {
   cv_empty.notify_one();
   ++empty_blocks;
 }
+
+// Cleaning up memory pools
+void MemoryIdxPool::cleanupMemoryPool() {
+  cv_empty.notify_all();
+  cv_filled.notify_all();
+}
+
+// Write a specified free block of a free window
+bool MemoryIdxPool::write_free_chunk(uint8_t idx, const char *data) {
+  size_t page_w_idx;
+  {
+    std::lock_guard <std::mutex> lock(offset_mutexes);
+
+    page_w_idx = idx + group_w_offset;
+    // Processing of winding data at the boundary
+    if (memory_pool[page_w_idx].is_free.load() == false) {
+      size_t this_group = group_w_idx.load();
+      size_t offset = ((this_group & REM_MAX_GROUPING_IDX) * MAX_IDX);
+      page_w_idx = idx + offset;
+      write_next_count ++;
+      // Lookup failed
+      if (memory_pool[page_w_idx].is_free.load() == false) {
+        printf("This block has been written, and there is a duplicate packge idx %d\n",idx);
+        return false;
+      }
+    } else {
+      write_count ++;
+      // Proceed to the next group
+      if (write_count == MAX_IDX) {
+        memory_pool[page_w_idx].is_free.store(false);
+        memcpy(memory_pool[page_w_idx].data.get(), data, 4096);
+
+        size_t next_w_idx = wait_next_free_group();
+        group_w_offset = (next_w_idx & REM_MAX_GROUPING_IDX) * MAX_IDX;
+        write_count = write_next_count;
+        write_next_count = 0;
+      return true;
+      }
+    }
+    memory_pool[page_w_idx].is_free.store(false);     
+  }
+  memcpy(memory_pool[page_w_idx].data.get(), data, 4096);
+
+  return true;
+}
+
+bool MemoryIdxPool::read_busy_chunk(char *data) {
+  size_t page_r_idx = read_count + group_r_offset;
+  size_t this_r_idx = ++read_count;
+
+  if (this_r_idx == MAX_IDX) {
+    read_count = 0;
+    size_t next_r_idx = wait_next_full_group();
+    group_r_offset = ((next_r_idx & REM_MAX_GROUPING_IDX) * MAX_IDX);
+  }
+  if (memory_pool[page_r_idx].is_free.load() == true) {
+    printf("An attempt was made to read the block of free %d\n", page_r_idx);
+    return false;
+  }
+
+  memcpy(data, memory_pool[page_r_idx].data.get(), 4096);
+  memory_pool[page_r_idx].is_free.store(true);
+
+  return true;
+}
+
+size_t MemoryIdxPool::wait_next_free_group() {
+  empty_blocks.fetch_sub(1);
+  size_t free_num = empty_blocks.load();
+  cv_filled.notify_all();
+  //Reserve at least two free blocks
+  if (free_num <= 2) {
+    std::unique_lock<std::mutex> lock(window_mutexes);
+    cv_empty.wait(lock, [this] { return empty_blocks.load() > 1;});
+  }
+  return group_w_idx.fetch_add(1);
+}
+
+size_t MemoryIdxPool::wait_next_full_group() {
+  empty_blocks.fetch_add(1);
+  size_t free_num = empty_blocks.load();
+  cv_empty.notify_all();
+
+  if (free_num >= MAX_GROUP_READ) {
+    std::unique_lock<std::mutex> lock(window_mutexes);
+    cv_filled.wait(lock, [this] { return empty_blocks.load() < MAX_GROUP_READ;});
+  }
+  return group_r_idx.fetch_add(1);
+}
+
+bool MemoryIdxPool::check_group() {
+  bool result = (group_w_idx.load() > group_r_idx.load()) ? true : false;
+  return result;
+}
\ No newline at end of file
diff --git a/src/test/csrc/common/mpool.h b/src/test/csrc/common/mpool.h
index 0484d6348..ed6e10f0c 100644
--- a/src/test/csrc/common/mpool.h
+++ b/src/test/csrc/common/mpool.h
@@ -110,4 +110,63 @@ class MemoryPool {
 };
 
 
+static const size_t MAX_IDX = 256;
+static const size_t MAX_GROUPING_IDX = NUM_BLOCKS / MAX_IDX;
+static const size_t MAX_GROUP_READ = MAX_GROUPING_IDX - 2; //窗口需要预留两个空闲空间
+static const size_t REM_MAX_IDX = (MAX_IDX - 1);
+static const size_t REM_MAX_GROUPING_IDX = (MAX_GROUPING_IDX - 1);
+
+// Split the memory pool into sliding Windows based on the index width
+// Support multi-thread out-of-order write sequential read
+class MemoryIdxPool {
+public:
+  MemoryIdxPool() {
+    initMemoryPool();
+  }
+
+  ~MemoryIdxPool() {
+    cleanupMemoryPool();
+  }
+  // Disable copy constructors and copy assignment operators
+  MemoryIdxPool(const MemoryIdxPool&) = delete;
+  MemoryIdxPool& operator=(const MemoryIdxPool&) = delete;
+
+  void initMemoryPool() {}
+
+  // Cleaning up memory pools
+  void cleanupMemoryPool();
+
+  // Write a specified free block of a free window
+  bool write_free_chunk(uint8_t idx, const char *data);
+
+  // Get the head memory
+  bool read_busy_chunk(char *data);
+
+  // Wait for the data to be free
+  size_t wait_next_free_group();
+
+  // Wait for the data to be readable
+  size_t wait_next_full_group();
+
+  // Check if there is a window to read
+  bool check_group();
+
+private:
+  MemoryBlock memory_pool[NUM_BLOCKS];  // Mempool
+  std::mutex window_mutexes; // window sliding protection
+  std::mutex offset_mutexes; // w/r offset protection
+  std::condition_variable cv_empty;  // Free block condition variable
+  std::condition_variable cv_filled; // Filled block condition variable
+
+  size_t group_r_offset = 0; // The offset used by the current consumer
+  size_t group_w_offset = 0; // The offset used by the current producer
+  size_t read_count = 0;
+  size_t write_count = 0;    
+  size_t write_next_count = 0;
+
+  std::atomic<size_t> empty_blocks{MAX_GROUP_READ};
+  std::atomic<size_t> group_w_idx{1};
+  std::atomic<size_t> group_r_idx{1};
+};
+
 #endif

From e07e5e2f2c7bf18450298bb6868157d493fb9396 Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Mon, 23 Sep 2024 16:49:16 +0800
Subject: [PATCH 09/17] fpga: fix mpool format

---
 src/test/csrc/common/mpool.cpp | 18 ++++++++--------
 src/test/csrc/common/mpool.h   | 38 +++++++++++++++-------------------
 2 files changed, 26 insertions(+), 30 deletions(-)

diff --git a/src/test/csrc/common/mpool.cpp b/src/test/csrc/common/mpool.cpp
index 05e5caa30..fcd08a5be 100644
--- a/src/test/csrc/common/mpool.cpp
+++ b/src/test/csrc/common/mpool.cpp
@@ -81,7 +81,7 @@ void MemoryIdxPool::cleanupMemoryPool() {
 bool MemoryIdxPool::write_free_chunk(uint8_t idx, const char *data) {
   size_t page_w_idx;
   {
-    std::lock_guard <std::mutex> lock(offset_mutexes);
+    std::lock_guard<std::mutex> lock(offset_mutexes);
 
     page_w_idx = idx + group_w_offset;
     // Processing of winding data at the boundary
@@ -89,14 +89,14 @@ bool MemoryIdxPool::write_free_chunk(uint8_t idx, const char *data) {
       size_t this_group = group_w_idx.load();
       size_t offset = ((this_group & REM_MAX_GROUPING_IDX) * MAX_IDX);
       page_w_idx = idx + offset;
-      write_next_count ++;
+      write_next_count++;
       // Lookup failed
       if (memory_pool[page_w_idx].is_free.load() == false) {
-        printf("This block has been written, and there is a duplicate packge idx %d\n",idx);
+        printf("This block has been written, and there is a duplicate packge idx %d\n", idx);
         return false;
       }
     } else {
-      write_count ++;
+      write_count++;
       // Proceed to the next group
       if (write_count == MAX_IDX) {
         memory_pool[page_w_idx].is_free.store(false);
@@ -106,10 +106,10 @@ bool MemoryIdxPool::write_free_chunk(uint8_t idx, const char *data) {
         group_w_offset = (next_w_idx & REM_MAX_GROUPING_IDX) * MAX_IDX;
         write_count = write_next_count;
         write_next_count = 0;
-      return true;
+        return true;
       }
     }
-    memory_pool[page_w_idx].is_free.store(false);     
+    memory_pool[page_w_idx].is_free.store(false);
   }
   memcpy(memory_pool[page_w_idx].data.get(), data, 4096);
 
@@ -143,7 +143,7 @@ size_t MemoryIdxPool::wait_next_free_group() {
   //Reserve at least two free blocks
   if (free_num <= 2) {
     std::unique_lock<std::mutex> lock(window_mutexes);
-    cv_empty.wait(lock, [this] { return empty_blocks.load() > 1;});
+    cv_empty.wait(lock, [this] { return empty_blocks.load() > 1; });
   }
   return group_w_idx.fetch_add(1);
 }
@@ -155,7 +155,7 @@ size_t MemoryIdxPool::wait_next_full_group() {
 
   if (free_num >= MAX_GROUP_READ) {
     std::unique_lock<std::mutex> lock(window_mutexes);
-    cv_filled.wait(lock, [this] { return empty_blocks.load() < MAX_GROUP_READ;});
+    cv_filled.wait(lock, [this] { return empty_blocks.load() < MAX_GROUP_READ; });
   }
   return group_r_idx.fetch_add(1);
 }
@@ -163,4 +163,4 @@ size_t MemoryIdxPool::wait_next_full_group() {
 bool MemoryIdxPool::check_group() {
   bool result = (group_w_idx.load() > group_r_idx.load()) ? true : false;
   return result;
-}
\ No newline at end of file
+}
diff --git a/src/test/csrc/common/mpool.h b/src/test/csrc/common/mpool.h
index ed6e10f0c..2b26bb241 100644
--- a/src/test/csrc/common/mpool.h
+++ b/src/test/csrc/common/mpool.h
@@ -18,11 +18,11 @@
 
 #include <atomic>
 #include <condition_variable>
+#include <cstring>
 #include <functional>
 #include <memory>
 #include <mutex>
 #include <vector>
-#include <cstring>
 
 #define MEMPOOL_SIZE   4096 * 1024 // 4M page
 #define MEMBLOCK_SIZE  4096        // 4K packge
@@ -30,27 +30,24 @@
 #define REM_NUM_BLOCKS (NUM_BLOCKS - 1)
 
 struct MemoryBlock {
-  std::unique_ptr<char[], std::function<void(char*)>> data;
+  std::unique_ptr<char[], std::function<void(char *)>> data;
   std::atomic<bool> is_free;
 
   MemoryBlock() : is_free(true) {
-    void* ptr = nullptr;
+    void *ptr = nullptr;
     if (posix_memalign(&ptr, 4096, 4096) != 0) {
       throw std::runtime_error("Failed to allocate aligned memory");
     }
     memset(ptr, 0, 4096);
-    data = std::unique_ptr<char[], std::function<void(char*)>>(
-      static_cast<char*>(ptr),
-      [](char* p) { free(p); }
-    );
+    data = std::unique_ptr<char[], std::function<void(char *)>>(static_cast<char *>(ptr), [](char *p) { free(p); });
   }
   // Disable copy operations
-  MemoryBlock(const MemoryBlock&) = delete;
-  MemoryBlock& operator=(const MemoryBlock&) = delete;
+  MemoryBlock(const MemoryBlock &) = delete;
+  MemoryBlock &operator=(const MemoryBlock &) = delete;
 
   // Enable move operations
-  MemoryBlock(MemoryBlock&&) = default;
-  MemoryBlock& operator=(MemoryBlock&&) = default;
+  MemoryBlock(MemoryBlock &&) = default;
+  MemoryBlock &operator=(MemoryBlock &&) = default;
 };
 
 class MemoryPool {
@@ -99,7 +96,7 @@ class MemoryPool {
   };
   std::vector<MemoryBlock> memory_pool;              // Mempool
   std::vector<std::mutex> block_mutexes{NUM_BLOCKS}; // Partition lock array
-  std::atomic<size_t> empty_blocks {NUM_BLOCKS};     // Free block count
+  std::atomic<size_t> empty_blocks{NUM_BLOCKS};      // Free block count
   std::atomic<size_t> filled_blocks;                 // Filled blocks count
   std::atomic<size_t> write_index;
   std::atomic<size_t> read_index;
@@ -109,7 +106,6 @@ class MemoryPool {
   size_t page_end = 0;
 };
 
-
 static const size_t MAX_IDX = 256;
 static const size_t MAX_GROUPING_IDX = NUM_BLOCKS / MAX_IDX;
 static const size_t MAX_GROUP_READ = MAX_GROUPING_IDX - 2; //窗口需要预留两个空闲空间
@@ -128,8 +124,8 @@ class MemoryIdxPool {
     cleanupMemoryPool();
   }
   // Disable copy constructors and copy assignment operators
-  MemoryIdxPool(const MemoryIdxPool&) = delete;
-  MemoryIdxPool& operator=(const MemoryIdxPool&) = delete;
+  MemoryIdxPool(const MemoryIdxPool &) = delete;
+  MemoryIdxPool &operator=(const MemoryIdxPool &) = delete;
 
   void initMemoryPool() {}
 
@@ -152,16 +148,16 @@ class MemoryIdxPool {
   bool check_group();
 
 private:
-  MemoryBlock memory_pool[NUM_BLOCKS];  // Mempool
-  std::mutex window_mutexes; // window sliding protection
-  std::mutex offset_mutexes; // w/r offset protection
-  std::condition_variable cv_empty;  // Free block condition variable
-  std::condition_variable cv_filled; // Filled block condition variable
+  MemoryBlock memory_pool[NUM_BLOCKS]; // Mempool
+  std::mutex window_mutexes;           // window sliding protection
+  std::mutex offset_mutexes;           // w/r offset protection
+  std::condition_variable cv_empty;    // Free block condition variable
+  std::condition_variable cv_filled;   // Filled block condition variable
 
   size_t group_r_offset = 0; // The offset used by the current consumer
   size_t group_w_offset = 0; // The offset used by the current producer
   size_t read_count = 0;
-  size_t write_count = 0;    
+  size_t write_count = 0;
   size_t write_next_count = 0;
 
   std::atomic<size_t> empty_blocks{MAX_GROUP_READ};

From b3a828c42dcc81d79cc73de332d484dc698e6bbc Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Mon, 23 Sep 2024 17:56:53 +0800
Subject: [PATCH 10/17] fpga: modify xdma to be multi-channel configurable and
 use a sliding window

---
 src/test/csrc/common/mpool.cpp |  2 +-
 src/test/csrc/common/mpool.h   |  7 ----
 src/test/csrc/fpga/xdma.cpp    | 70 +++++++++++++++++-----------------
 src/test/csrc/fpga/xdma.h      | 12 +++---
 4 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/src/test/csrc/common/mpool.cpp b/src/test/csrc/common/mpool.cpp
index fcd08a5be..4e83e63ae 100644
--- a/src/test/csrc/common/mpool.cpp
+++ b/src/test/csrc/common/mpool.cpp
@@ -126,7 +126,7 @@ bool MemoryIdxPool::read_busy_chunk(char *data) {
     group_r_offset = ((next_r_idx & REM_MAX_GROUPING_IDX) * MAX_IDX);
   }
   if (memory_pool[page_r_idx].is_free.load() == true) {
-    printf("An attempt was made to read the block of free %d\n", page_r_idx);
+    printf("An attempt was made to read the block of free %zu\n", page_r_idx);
     return false;
   }
 
diff --git a/src/test/csrc/common/mpool.h b/src/test/csrc/common/mpool.h
index 2b26bb241..b78bf4ad1 100644
--- a/src/test/csrc/common/mpool.h
+++ b/src/test/csrc/common/mpool.h
@@ -41,13 +41,6 @@ struct MemoryBlock {
     memset(ptr, 0, 4096);
     data = std::unique_ptr<char[], std::function<void(char *)>>(static_cast<char *>(ptr), [](char *p) { free(p); });
   }
-  // Disable copy operations
-  MemoryBlock(const MemoryBlock &) = delete;
-  MemoryBlock &operator=(const MemoryBlock &) = delete;
-
-  // Enable move operations
-  MemoryBlock(MemoryBlock &&) = default;
-  MemoryBlock &operator=(MemoryBlock &&) = default;
 };
 
 class MemoryPool {
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index 985934f58..6a90f16e6 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -16,6 +16,7 @@
 #include "xdma.h"
 #include "mpool.h"
 #include <fcntl.h>
+#include <iostream>
 #include <signal.h>
 
 #define XDMA_C2H_DEVICE "/dev/xdma0_c2h_"
@@ -24,10 +25,10 @@ static const int dma_channel = CONFIG_DMA_CHANNELS;
 
 FpgaXdma::FpgaXdma() {
   signal(SIGINT, handle_sigint);
-  for (int channel = 0; i < dma_channel; channel ++) {
+  for (int i = 0; i < dma_channel; i++) {
     char c2h_device[64];
-    sprintf(c2h_device,"%s%d",DEVICE_C2H_NAME,i); 
-    xdma_c2h_fd[i] = open(c2h_device, O_RDONLY );
+    sprintf(c2h_device, "%s%d", XDMA_C2H_DEVICE, i);
+    xdma_c2h_fd[i] = open(c2h_device, O_RDONLY);
     if (xdma_c2h_fd[i] == -1) {
       std::cout << c2h_device << std::endl;
       perror("Failed to open XDMA device");
@@ -36,13 +37,13 @@ FpgaXdma::FpgaXdma() {
     std::cout << "XDMA link " << c2h_device << std::endl;
   }
 
-  xdma_h2c_fd[i] = open(h2c_device, O_WRONLY);
-  if (xdma_h2c_fd[i] == -1) {
-    std::cout << h2c_device << std::endl;
+  xdma_h2c_fd = open(XDMA_H2C_DEVICE, O_WRONLY);
+  if (xdma_h2c_fd == -1) {
+    std::cout << XDMA_H2C_DEVICE << std::endl;
     perror("Failed to open XDMA device");
     exit(-1);
   }
-  std::cout << "XDMA link " << h2c_device << std::endl;
+  std::cout << "XDMA link " << XDMA_H2C_DEVICE << std::endl;
 }
 
 void FpgaXdma::handle_sigint(int sig) {
@@ -54,54 +55,55 @@ void FpgaXdma::start_transmit_thread() {
   if (running == true)
     return;
 
-  for(int i = 0; i < dma_channel;i ++) {
+  for (int i = 0; i < dma_channel; i++) {
     printf("start channel %d \n", i);
     receive_thread[i] = std::thread(&FpgaXdma::read_xdma_thread, this, i);
   }
-  process_thread[i] = std::thread(&FpgaXdma::write_difftest_thread, this, i);
+  process_thread = std::thread(&FpgaXdma::write_difftest_thread, this);
   running = true;
 }
 
 void FpgaXdma::stop_thansmit_thread() {
   if (running == false)
     return;
-  xdma_mempool.unlock_thread();
-  if (receive_thread.joinable())
-    receive_thread.join();
+  running = false;
+
+  for (int i = 0; i < CONFIG_DMA_CHANNELS; i++) {
+    if (receive_thread[i].joinable())
+      receive_thread[i].join();
+    close(xdma_c2h_fd[i]);
+  }
+
   if (process_thread.joinable())
     process_thread.join();
-  running = false;
+
+  close(xdma_h2c_fd);
+  xdma_mempool.cleanupMemoryPool();
 }
 
 void FpgaXdma::read_xdma_thread(int channel) {
+  FpgaPackgeHead packge;
+  bool result = true;
   while (running) {
-    char *memory = xdma_mempool.get_free_chunk();
-    read(fd_c2h, memory, recv_size);
-    xdma_mempool.set_busy_chunk();
+    size_t size = read(xdma_c2h_fd[channel], &packge, sizeof(FpgaPackgeHead));
+    uint8_t idx = packge.packge_idx;
+    if (xdma_mempool.write_free_chunk(idx, (char *)&packge) == false) {
+      printf("It should not be the case that no available block can be found\n");
+      assert(0);
+    }
   }
 }
 
 void FpgaXdma::write_difftest_thread() {
+  FpgaPackgeHead packge;
+  bool result = true;
   while (running) {
-    const char *memory = xdma_mempool.get_busy_chunk();
-    static uint8_t valid_core = 0;
-    uint8_t core_id = 0;
-
-    memcpy(&core_id, memory + sizeof(DiffTestState), sizeof(uint8_t));
-    assert(core_id > NUM_CORES);
-    {
-      std::unique_lock<std::mutex> lock(diff_mtx);
-      diff_empile_cv.wait(lock, [this] { return !diff_packge_filled; });
-      memcpy(&difftest_pack[core_id], memory, sizeof(DiffTestState));
+    if (xdma_mempool.read_busy_chunk((char *)&packge) == false) {
+      printf("Failed to read data from the XDMA memory pool\n");
+      assert(0);
     }
-    valid_core++;
-    xdma_mempool.set_free_chunk();
+    // packge unpack
 
-    if (valid_core == NUM_CORES) {
-      diff_packge_filled = true;
-      valid_core = 0;
-      // Notify difftest to run the next check
-      diff_filled_cv.notify_one();
-    }
+    // difftest run
   }
 }
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index d5572bea1..cb4307c60 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -30,20 +30,20 @@
 #define WITH_FPGA
 typedef struct FpgaPackgeHead {
   DiffTestState difftestinfo;
-  uint8_t corid;
+  uint8_t packge_idx;
 } FpgaPackgeHead;
 
 class FpgaXdma {
 public:
   struct FpgaPackgeHead *shmadd_recv;
 
-  MemoryPool xdma_mempool[DMA_CHANNS];
+  MemoryIdxPool xdma_mempool;
   DiffTestState difftest_pack[NUM_CORES] = {};
   int shmid_recv;
   int ret_recv;
   key_t key_recv;
 
-  int xdma_c2h_fd[DMA_CHANNS];
+  int xdma_c2h_fd[CONFIG_DMA_CHANNELS];
   int xdma_h2c_fd;
 
   int fd_interrupt;
@@ -66,12 +66,12 @@ class FpgaXdma {
   // thread api
   void start_transmit_thread();
   void stop_thansmit_thread();
-  void read_xdma_thread();
+  void read_xdma_thread(int channel);
   void write_difftest_thread();
 
 private:
-  std::thread receive_thread[DMA_CHANNS];
-  std::thread process_thread[DMA_CHANNS];
+  std::thread receive_thread[CONFIG_DMA_CHANNELS];
+  std::thread process_thread;
 
   static void handle_sigint(int sig);
 };

From 2f9ee343a634340718d25354acedcb3c36d52b39 Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Tue, 24 Sep 2024 17:26:40 +0800
Subject: [PATCH 11/17] fpga: Improve the operation logic of fpga diff

---
 src/test/csrc/fpga/fpga_main.cpp | 50 ++++++++++++++++++--------------
 src/test/csrc/fpga/xdma.cpp      |  3 +-
 src/test/csrc/fpga/xdma.h        | 20 +++++--------
 3 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/src/test/csrc/fpga/fpga_main.cpp b/src/test/csrc/fpga/fpga_main.cpp
index 274325c15..2f4ed5a11 100644
--- a/src/test/csrc/fpga/fpga_main.cpp
+++ b/src/test/csrc/fpga/fpga_main.cpp
@@ -15,6 +15,7 @@
 ***************************************************************************************/
 
 #include "diffstate.h"
+#include "difftest-dpic.h"
 #include "difftest.h"
 #include "mpool.h"
 #include "refproxy.h"
@@ -26,6 +27,7 @@ enum {
   SIMV_FAIL,
 } simv_state;
 
+static char work_load[256] = "/dev/zero";
 static uint8_t simv_result = SIMV_RUN;
 static uint64_t max_instrs = 0;
 
@@ -39,7 +41,6 @@ static core_end_info_t core_end_info;
 void simv_init();
 void simv_step();
 void cpu_endtime_check();
-void set_dut_from_xdma();
 void set_diff_ref_so(char *s);
 void args_parsingniton(int argc, char *argv[]);
 
@@ -50,12 +51,13 @@ int main(int argc, char *argv[]) {
   simv_init();
 
   while (simv_result == SIMV_RUN) {
-    // get xdma data
-    set_dut_from_xdma();
-
-    // run difftest
-    simv_step();
-    cpu_endtime_check();
+    // Poll the package counter; step only once the XDMA side has counted at least one package
+    if (xdma_device->diff_packge_count.load(std::memory_order_seq_cst) > 0) {
+      // run one difftest step, then consume one counted package
+      simv_step();
+      cpu_endtime_check();
+      xdma_device->diff_packge_count.fetch_sub(1, std::memory_order_relaxed);
+    }
   }
   free(xdma_device);
 }
@@ -68,28 +70,30 @@ void set_diff_ref_so(char *s) {
   difftest_ref_so = buf;
 }
 
-void set_dut_from_xdma() {
-  {
-    std::unique_lock<std::mutex> lock(xdma_device->diff_mtx);
-    xdma_device->diff_filled_cv.wait(lock, [] { return xdma_device->diff_packge_filled; });
-    for (int i = 0; i < NUM_CORES; i++) {
-
-      difftest[i]->dut = &xdma_device->difftest_pack[i];
-    }
-    xdma_device->diff_packge_filled = false;
-    xdma_device->diff_empile_cv.notify_one();
-  }
-}
-
 void simv_init() {
   xdma_device = new FpgaXdma;
   difftest_init();
-  max_instrs = 40000000;
 }
 
 void simv_step() {
   if (difftest_step())
     simv_result = SIMV_FAIL;
+  if (difftest_state() != -1) {
+    int trapCode = difftest_state();
+    for (int i = 0; i < NUM_CORES; i++) {
+      printf("Core %d: ", i);
+      uint64_t pc = difftest[i]->get_trap_event()->pc;
+      switch (trapCode) {
+        case 0: eprintf(ANSI_COLOR_GREEN "HIT GOOD TRAP at pc = 0x%" PRIx64 "\n" ANSI_COLOR_RESET, pc); break;
+        default: eprintf(ANSI_COLOR_RED "Unknown trap code: %d\n" ANSI_COLOR_RESET, trapCode);
+      }
+      difftest[i]->display_stats();
+    }
+    if (trapCode == 0)
+      simv_result = SIMV_DONE;
+    else
+      simv_result = SIMV_FAIL;
+  }
 }
 
 void cpu_endtime_check() {
@@ -116,6 +120,10 @@ void args_parsingniton(int argc, char *argv[]) {
   for (int i = 1; i < argc; ++i) {
     if (strcmp(argv[i], "--diff") == 0) {
       set_diff_ref_so(argv[++i]);
+    } else if (strcmp(argv[i], "-i") == 0) {
+      snprintf(work_load, sizeof(work_load), "%s", argv[++i]); // bounded, NUL-terminated copy of the -i argument
+    } else if (strcmp(argv[i], "--max-instrs") == 0) {
+      max_instrs = std::stoul(argv[++i], nullptr, 16);
     }
   }
 }
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index 6a90f16e6..6854bcfc4 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -103,7 +103,8 @@ void FpgaXdma::write_difftest_thread() {
       assert(0);
     }
     // packge unpack
-
+    v_difftest_Batch(packge.difftest_batch_info.io_data, packge.difftest_batch_info.io_info);
     // difftest run
+    diff_packge_count.fetch_add(1, std::memory_order_relaxed);
   }
 }
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index cb4307c60..ae9eeeeda 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -18,7 +18,9 @@
 
 #include "common.h"
 #include "diffstate.h"
+#include "difftest-dpic.h"
 #include "mpool.h"
+#include <atomic>
 #include <queue>
 #include <stdbool.h>
 #include <stdio.h>
@@ -28,8 +30,9 @@
 #include <vector>
 
 #define WITH_FPGA
+
 typedef struct FpgaPackgeHead {
-  DiffTestState difftestinfo;
+  BatchInfo difftest_batch_info;
   uint8_t packge_idx;
 } FpgaPackgeHead;
 
@@ -38,31 +41,22 @@ class FpgaXdma {
   struct FpgaPackgeHead *shmadd_recv;
 
   MemoryIdxPool xdma_mempool;
-  DiffTestState difftest_pack[NUM_CORES] = {};
-  int shmid_recv;
-  int ret_recv;
-  key_t key_recv;
 
   int xdma_c2h_fd[CONFIG_DMA_CHANNELS];
   int xdma_h2c_fd;
 
-  int fd_interrupt;
   bool running = false;
 
-  unsigned int recv_size = sizeof(FpgaPackgeHead);
-  unsigned long old_exec_instr = 0;
-
   std::condition_variable diff_filled_cv;
   std::condition_variable diff_empile_cv;
-  std::mutex diff_mtx;
-  bool diff_packge_filled = false;
+
+  std::atomic<uint32_t> diff_packge_count{0};
+
   FpgaXdma();
   ~FpgaXdma() {
     stop_thansmit_thread();
   };
 
-  void set_dma_fd_block();
-
   // thread api
   void start_transmit_thread();
   void stop_thansmit_thread();

From 5988694501a0a3a4b31f1be406e9983c1b59906f Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Wed, 25 Sep 2024 11:35:26 +0800
Subject: [PATCH 12/17] fpga: Remove redundant mempool-MemoryBlock definitions

---
 src/test/csrc/common/mpool.h | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/test/csrc/common/mpool.h b/src/test/csrc/common/mpool.h
index b78bf4ad1..69d031e70 100644
--- a/src/test/csrc/common/mpool.h
+++ b/src/test/csrc/common/mpool.h
@@ -41,6 +41,21 @@ struct MemoryBlock {
     memset(ptr, 0, 4096);
     data = std::unique_ptr<char[], std::function<void(char *)>>(static_cast<char *>(ptr), [](char *p) { free(p); });
   }
+  // Move constructors
+  MemoryBlock(MemoryBlock &&other) noexcept : data(std::move(other.data)), is_free(other.is_free.load()) {}
+
+  // Move assignment operator
+  MemoryBlock &operator=(MemoryBlock &&other) noexcept {
+    if (this != &other) {
+      data = std::move(other.data);
+      is_free.store(other.is_free.load());
+    }
+    return *this;
+  }
+
+  // Disable the copy constructor and copy assignment operator
+  MemoryBlock(const MemoryBlock &) = delete;
+  MemoryBlock &operator=(const MemoryBlock &) = delete;
 };
 
 class MemoryPool {
@@ -75,18 +90,6 @@ class MemoryPool {
   void set_free_chunk();
 
 private:
-  struct MemoryBlock {
-    std::unique_ptr<char, std::function<void(char *)>> data;
-    bool is_free;
-
-    MemoryBlock() : is_free(true) {
-      void *ptr = nullptr;
-      if (posix_memalign(&ptr, MEMBLOCK_SIZE, MEMBLOCK_SIZE * 2) != 0) {
-        throw std::runtime_error("Failed to allocate aligned memory");
-      }
-      data = std::unique_ptr<char, std::function<void(char *)>>(static_cast<char *>(ptr), [](char *p) { free(p); });
-    }
-  };
   std::vector<MemoryBlock> memory_pool;              // Mempool
   std::vector<std::mutex> block_mutexes{NUM_BLOCKS}; // Partition lock array
   std::atomic<size_t> empty_blocks{NUM_BLOCKS};      // Free block count
@@ -101,7 +104,7 @@ class MemoryPool {
 
 static const size_t MAX_IDX = 256;
 static const size_t MAX_GROUPING_IDX = NUM_BLOCKS / MAX_IDX;
-static const size_t MAX_GROUP_READ = MAX_GROUPING_IDX - 2; //窗口需要预留两个空闲空间
+static const size_t MAX_GROUP_READ = MAX_GROUPING_IDX - 2; // The window needs to reserve two free spaces
 static const size_t REM_MAX_IDX = (MAX_IDX - 1);
 static const size_t REM_MAX_GROUPING_IDX = (MAX_GROUPING_IDX - 1);
 

From 36f2638887805cc22dadb1b636be4b4cc62597cc Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Wed, 25 Sep 2024 15:12:10 +0800
Subject: [PATCH 13/17] fpga: Adapt package processing to the new batch format

---
 src/test/csrc/fpga/xdma.cpp |  7 +++----
 src/test/csrc/fpga/xdma.h   | 13 ++++---------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index 6854bcfc4..eb8bb4581 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -21,11 +21,10 @@
 
 #define XDMA_C2H_DEVICE "/dev/xdma0_c2h_"
 #define XDMA_H2C_DEVICE "/dev/xdma0_h2c_0"
-static const int dma_channel = CONFIG_DMA_CHANNELS;
 
 FpgaXdma::FpgaXdma() {
   signal(SIGINT, handle_sigint);
-  for (int i = 0; i < dma_channel; i++) {
+  for (int i = 0; i < CONFIG_DMA_CHANNELS; i++) {
     char c2h_device[64];
     sprintf(c2h_device, "%s%d", XDMA_C2H_DEVICE, i);
     xdma_c2h_fd[i] = open(c2h_device, O_RDONLY);
@@ -55,7 +54,7 @@ void FpgaXdma::start_transmit_thread() {
   if (running == true)
     return;
 
-  for (int i = 0; i < dma_channel; i++) {
+  for (int i = 0; i < CONFIG_DMA_CHANNELS; i++) {
     printf("start channel %d \n", i);
     receive_thread[i] = std::thread(&FpgaXdma::read_xdma_thread, this, i);
   }
@@ -103,7 +102,7 @@ void FpgaXdma::write_difftest_thread() {
       assert(0);
     }
     // packge unpack
-    v_difftest_Batch(packge.difftest_batch_info.io_data, packge.difftest_batch_info.io_info);
+    v_difftest_Batch((uint8_t *)packge.diff_batch_pack);
     // difftest run
     diff_packge_count.fetch_add(1, std::memory_order_relaxed);
   }
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index ae9eeeeda..9319f823a 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -32,24 +32,16 @@
 #define WITH_FPGA
 
 typedef struct FpgaPackgeHead {
-  BatchInfo difftest_batch_info;
   uint8_t packge_idx;
+  char diff_batch_pack[CONFIG_DIFFTEST_BATCH_BYTELEN];
 } FpgaPackgeHead;
 
 class FpgaXdma {
 public:
-  struct FpgaPackgeHead *shmadd_recv;
-
   MemoryIdxPool xdma_mempool;
 
-  int xdma_c2h_fd[CONFIG_DMA_CHANNELS];
-  int xdma_h2c_fd;
-
   bool running = false;
 
-  std::condition_variable diff_filled_cv;
-  std::condition_variable diff_empile_cv;
-
   std::atomic<uint32_t> diff_packge_count{0};
 
   FpgaXdma();
@@ -67,6 +59,9 @@ class FpgaXdma {
   std::thread receive_thread[CONFIG_DMA_CHANNELS];
   std::thread process_thread;
 
+  int xdma_c2h_fd[CONFIG_DMA_CHANNELS];
+  int xdma_h2c_fd;
+
   static void handle_sigint(int sig);
 };
 

From bf239e8d3080180d62ac918e376d18c5087f69b7 Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Wed, 25 Sep 2024 18:42:25 +0800
Subject: [PATCH 14/17] CI: add fpga-diff compile ci

---
 .github/workflows/main.yml | 41 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ad92367af..bd81da333 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -450,3 +450,44 @@ jobs:
             make difftest_verilog PROFILE=../build/generated-src/difftest_profile.json NUMCORES=1 CONFIG=ZEL MFC=1
             make simv VCS=verilator WITH_CHISELDB=0 WITH_CONSTANTIN=0 IOTRACE_ZSTD=1
             ./build/simv +workload=../ready-to-run/microbench.bin +e=0 +diff=../ready-to-run/riscv64-nemu-interpreter-so +iotrace-name=../iotrace
+
+  # test-difftest-fpga:
+  #   runs-on: ubuntu-22.04
+
+  #   needs: test-difftest-main
+
+  #   steps:
+  #     - uses: actions/checkout@v4
+
+  #     - name: Prepare environment
+  #       run: |
+  #           cd $GITHUB_WORKSPACE/..
+  #           git config --global url."https://github.com/".insteadOf git@github.com:
+  #           git config --global url."https://".insteadOf git://
+  #           git clone https://github.com/OpenXiangShan/xs-env
+  #           cd xs-env
+  #           sudo -s ./setup-tools.sh
+  #           source ./setup.sh
+
+  #     - name: Prepare NutShell
+  #       run: |
+  #           cd $GITHUB_WORKSPACE/../xs-env
+  #           rm -r NutShell
+  #           git clone -b dev-difftest --single-branch https://github.com/OSCPU/NutShell.git
+  #           cd NutShell && git submodule update --init
+  #           rm -r difftest
+  #           cp -r $GITHUB_WORKSPACE .
+
+  #     - name: Enable -Werror for EMU Build
+  #       run: |
+  #         echo "CXX_NO_WARNING=1" >> $GITHUB_ENV
+
+  #     - name: FPGA-difftest Build
+  #       run: |
+  #           cd $GITHUB_WORKSPACE/../xs-env
+  #           source ./env.sh
+  #           cd $GITHUB_WORKSPACE/../xs-env/NutShell
+  #           source ./env.sh
+  #           make clean
+  #           make sim-verilog MILL_ARGS="--difftest-config ENBF" -j2
+  #           make fpga-build DMA_CHANNELS=2 WITH_CHISELDB=0 WITH_CONSTANTIN=0

From 5a211182bc674ce4d4333e23d6af85f3bbfecbc7 Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Thu, 26 Sep 2024 11:22:58 +0800
Subject: [PATCH 15/17] fpga: svdpi.h is not referenced when fpga is used

---
 src/main/scala/DPIC.scala | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/DPIC.scala b/src/main/scala/DPIC.scala
index 1ae6a1dd7..db4848fa1 100644
--- a/src/main/scala/DPIC.scala
+++ b/src/main/scala/DPIC.scala
@@ -297,6 +297,7 @@ private class DummyDPICBatchWrapper(
 
 object DPIC {
   val interfaces = ListBuffer.empty[(String, String, String)]
+  var defMacros = new StringBuilder()
 
   def apply(control: GatewaySinkControl, io: Valid[DifftestBundle], config: GatewayConfig): Unit = {
     val module = Module(new DummyDPICWrapper(chiselTypeOf(io), config))
@@ -314,6 +315,12 @@ object DPIC {
     module.control := control
     module.io := io
     val dpic = module.dpic
+    if (!config.isFPGA)
+      defMacros ++=
+        s"""
+           |#ifdef CONFIG_DIFFTEST_BATCH
+           |#include "svdpi.h"
+           |#endif // CONFIG_DIFFTEST_BATCH""".stripMargin
     interfaces += ((dpic.dpicFuncName, dpic.dpicFuncProto, dpic.dpicFunc))
   }
 
@@ -328,12 +335,10 @@ object DPIC {
     interfaceCpp += ""
     interfaceCpp += "#include <cstdint>"
     interfaceCpp += "#include \"diffstate.h\""
-    interfaceCpp += "#ifdef CONFIG_DIFFTEST_BATCH"
-    interfaceCpp += "#include \"svdpi.h\""
-    interfaceCpp += "#endif // CONFIG_DIFFTEST_BATCH"
     interfaceCpp += "#ifdef CONFIG_DIFFTEST_PERFCNT"
     interfaceCpp += "#include \"perf.h\""
     interfaceCpp += "#endif // CONFIG_DIFFTEST_PERFCNT"
+    interfaceCpp += defMacros.toString()
     interfaceCpp += ""
     interfaceCpp +=
       """

From b4b1fc0cca6a8fe7992e4c6752a7e6f42d42c692 Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Thu, 26 Sep 2024 17:02:42 +0800
Subject: [PATCH 16/17] fpga: Burn workload to fpga ddr at boot time

---
 src/test/csrc/fpga/fpga_main.cpp |  2 +-
 src/test/csrc/fpga/xdma.cpp      | 49 +++++++++++++++++++++++++++++++-
 src/test/csrc/fpga/xdma.h        | 18 +++++++++++-
 3 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/src/test/csrc/fpga/fpga_main.cpp b/src/test/csrc/fpga/fpga_main.cpp
index 2f4ed5a11..689dfbf4b 100644
--- a/src/test/csrc/fpga/fpga_main.cpp
+++ b/src/test/csrc/fpga/fpga_main.cpp
@@ -71,7 +71,7 @@ void set_diff_ref_so(char *s) {
 }
 
 void simv_init() {
-  xdma_device = new FpgaXdma;
+  xdma_device = new FpgaXdma(work_load);
   difftest_init();
 }
 
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index eb8bb4581..c7e578f7d 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -16,14 +16,20 @@
 #include "xdma.h"
 #include "mpool.h"
 #include <fcntl.h>
+#include <fstream>
 #include <iostream>
 #include <signal.h>
+#include <sys/mman.h>
 
+#define XDMA_USER       "/dev/xdma0_user"
+#define XDMA_BYPASS     "/dev/xdma0_bypass"
 #define XDMA_C2H_DEVICE "/dev/xdma0_c2h_"
 #define XDMA_H2C_DEVICE "/dev/xdma0_h2c_0"
 
-FpgaXdma::FpgaXdma() {
+FpgaXdma::FpgaXdma(const char *workload) {
   signal(SIGINT, handle_sigint);
+  ddr_load_workload(workload);
+
   for (int i = 0; i < CONFIG_DMA_CHANNELS; i++) {
     char c2h_device[64];
     sprintf(c2h_device, "%s%d", XDMA_C2H_DEVICE, i);
@@ -50,6 +56,47 @@ void FpgaXdma::handle_sigint(int sig) {
   exit(1);
 }
 
+// write xdma_bypass memory or xdma_user
+int FpgaXdma::device_write(bool is_bypass, const char *workload, uint64_t addr, uint64_t value) {
+  uint64_t pg_size = sysconf(_SC_PAGE_SIZE);
+  uint64_t size = !is_bypass ? 0x1000 : 0x10000;
+  uint64_t aligned_size = (size + 0xffful) & ~0xffful;
+  uint64_t base = addr & ~0xffful;
+  uint32_t offset = addr & 0xfffu;
+  std::ifstream workload_fd;
+  int fd = -1;
+
+  if (base % pg_size != 0) {
+    printf("base must be a multiple of system page size\n");
+    return -1;
+  }
+
+  if (is_bypass)
+    fd = open(XDMA_BYPASS, O_RDWR | O_SYNC);
+  else
+    fd = open(XDMA_USER, O_RDWR | O_SYNC);
+  if (fd < 0) {
+    printf("failed to open %s\n", is_bypass ? XDMA_BYPASS : XDMA_USER);
+    return -1;
+  }
+
+  void *m_ptr = mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, base);
+  if (m_ptr == MAP_FAILED) {
+    close(fd);
+    printf("failed to mmap\n");
+    return -1;
+  }
+
+  if (is_bypass) {
+    workload_fd.read(((char *)m_ptr) + offset, size);
+  } else {
+    ((volatile uint32_t *)m_ptr)[offset >> 2] = value;
+  }
+
+  munmap(m_ptr, aligned_size);
+  close(fd);
+}
+
 void FpgaXdma::start_transmit_thread() {
   if (running == true)
     return;
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index 9319f823a..86fef90e4 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -44,11 +44,27 @@ class FpgaXdma {
 
   std::atomic<uint32_t> diff_packge_count{0};
 
-  FpgaXdma();
+  FpgaXdma(const char *workload);
   ~FpgaXdma() {
     stop_thansmit_thread();
   };
 
+  int core_reset() {
+    device_write(false, nullptr, 0x100000, 0x1);
+    device_write(false, nullptr, 0x10000, 0x8);
+  }
+
+  int core_restart() {
+    device_write(false, nullptr, 0x100000, 0);
+  }
+
+  int ddr_load_workload(const char *workload) {
+    core_reset();
+    device_write(true, workload, 0, 0);
+    core_restart();
+  }
+
+  int device_write(bool is_bypass, const char *workload, uint64_t addr, uint64_t value);
   // thread api
   void start_transmit_thread();
   void stop_thansmit_thread();

From 9b1bf1fb20fc116027724ed20580cf6cba2851a9 Mon Sep 17 00:00:00 2001
From: Kami <fengkehan@bosc.ac.cn>
Date: Fri, 27 Sep 2024 15:34:30 +0800
Subject: [PATCH 17/17] fpga: Load memory for the ref module

---
 src/test/csrc/common/ram.h       |  3 +++
 src/test/csrc/fpga/fpga_main.cpp |  4 ++++
 src/test/csrc/fpga/xdma.cpp      | 17 +++++++++++------
 src/test/csrc/fpga/xdma.h        |  9 +++++----
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/src/test/csrc/common/ram.h b/src/test/csrc/common/ram.h
index 78d242e85..2a791a98b 100644
--- a/src/test/csrc/common/ram.h
+++ b/src/test/csrc/common/ram.h
@@ -110,6 +110,9 @@ class SimMemory {
   uint64_t get_size() {
     return memory_size;
   }
+  uint64_t get_load_img_size() {
+    return get_img_size();
+  }
   bool in_range_u8(uint64_t address) {
     return address < memory_size;
   }
diff --git a/src/test/csrc/fpga/fpga_main.cpp b/src/test/csrc/fpga/fpga_main.cpp
index 689dfbf4b..f35589d3b 100644
--- a/src/test/csrc/fpga/fpga_main.cpp
+++ b/src/test/csrc/fpga/fpga_main.cpp
@@ -18,6 +18,7 @@
 #include "difftest-dpic.h"
 #include "difftest.h"
 #include "mpool.h"
+#include "ram.h"
 #include "refproxy.h"
 #include "xdma.h"
 
@@ -60,6 +61,8 @@ int main(int argc, char *argv[]) {
     }
   }
   free(xdma_device);
+  printf("difftest releases the fpga device and exits\n");
+  exit(0);
 }
 
 void set_diff_ref_so(char *s) {
@@ -72,6 +75,7 @@ void set_diff_ref_so(char *s) {
 
 void simv_init() {
+  init_ram(work_load, DEFAULT_EMU_RAM_SIZE); // must precede FpgaXdma(): its ctor loads the image through simMemory
   xdma_device = new FpgaXdma(work_load);
   difftest_init();
 }
 
diff --git a/src/test/csrc/fpga/xdma.cpp b/src/test/csrc/fpga/xdma.cpp
index c7e578f7d..2ad8840c0 100644
--- a/src/test/csrc/fpga/xdma.cpp
+++ b/src/test/csrc/fpga/xdma.cpp
@@ -15,6 +15,7 @@
 ***************************************************************************************/
 #include "xdma.h"
 #include "mpool.h"
+#include "ram.h"
 #include <fcntl.h>
 #include <fstream>
 #include <iostream>
@@ -57,18 +58,17 @@ void FpgaXdma::handle_sigint(int sig) {
 }
 
 // write xdma_bypass memory or xdma_user
-int FpgaXdma::device_write(bool is_bypass, const char *workload, uint64_t addr, uint64_t value) {
+void FpgaXdma::device_write(bool is_bypass, const char *workload, uint64_t addr, uint64_t value) {
   uint64_t pg_size = sysconf(_SC_PAGE_SIZE);
   uint64_t size = !is_bypass ? 0x1000 : 0x10000;
   uint64_t aligned_size = (size + 0xffful) & ~0xffful;
   uint64_t base = addr & ~0xffful;
   uint32_t offset = addr & 0xfffu;
-  std::ifstream workload_fd;
   int fd = -1;
 
   if (base % pg_size != 0) {
     printf("base must be a multiple of system page size\n");
-    return -1;
+    exit(-1);
   }
 
   if (is_bypass)
@@ -77,18 +77,23 @@ int FpgaXdma::device_write(bool is_bypass, const char *workload, uint64_t addr,
     fd = open(XDMA_USER, O_RDWR | O_SYNC);
   if (fd < 0) {
     printf("failed to open %s\n", is_bypass ? XDMA_BYPASS : XDMA_USER);
-    return -1;
+    exit(-1);
   }
 
   void *m_ptr = mmap(nullptr, aligned_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, base);
   if (m_ptr == MAP_FAILED) {
     close(fd);
     printf("failed to mmap\n");
-    return -1;
+    exit(-1);
   }
 
   if (is_bypass) {
-    workload_fd.read(((char *)m_ptr) + offset, size);
+    if (offset + simMemory->get_load_img_size() > aligned_size) { // the copy below starts at m_ptr + offset
+      printf("The loaded workload size exceeds the xdma bypass size\n");
+      exit(-1);
+    }
+    memcpy(static_cast<char *>(m_ptr) + offset, static_cast<const void *>(simMemory->as_ptr()),
+           simMemory->get_load_img_size());
   } else {
     ((volatile uint32_t *)m_ptr)[offset >> 2] = value;
   }
diff --git a/src/test/csrc/fpga/xdma.h b/src/test/csrc/fpga/xdma.h
index 86fef90e4..ceca1e8e2 100644
--- a/src/test/csrc/fpga/xdma.h
+++ b/src/test/csrc/fpga/xdma.h
@@ -49,22 +49,23 @@ class FpgaXdma {
     stop_thansmit_thread();
   };
 
-  int core_reset() {
+  void core_reset() {
     device_write(false, nullptr, 0x100000, 0x1);
     device_write(false, nullptr, 0x10000, 0x8);
   }
 
-  int core_restart() {
+  void core_restart() {
     device_write(false, nullptr, 0x100000, 0);
   }
 
-  int ddr_load_workload(const char *workload) {
+  void ddr_load_workload(const char *workload) {
     core_reset();
     device_write(true, workload, 0, 0);
     core_restart();
   }
 
-  int device_write(bool is_bypass, const char *workload, uint64_t addr, uint64_t value);
+  void device_write(bool is_bypass, const char *workload, uint64_t addr, uint64_t value);
+
   // thread api
   void start_transmit_thread();
   void stop_thansmit_thread();