Working Calyx Implementation of AXI Read channels (#1820)

* init commit of hardcoded axi wrapper for a 'main' kernel * add axi-reads-calyx * hook up inputs to channels in the wrapper. tbd if this works * Working calyx verison of AR and R TBD if this actually implements AXI correctly. There are currently some hacks in place (marked with todos) to get this to compile, namely some splicing that doesn't consider what we actually want to splice (it just takes [31:0]) as opposed to dynamically considering actual bits we want. A few other things that should be cleaned up eventually Need to create a cocotb testbench to test correctness * Track output of compiled calyx read channel Maybe this shouldn't be here, but for now (having deleted my working directory earlier) putting it here * update gitignore to get rid of sim_build and other cocotb artifacts * Working make files for running cocotb tests Simply run make from the cocotb directory and axi-read-tests will be executed * Add xID signals for cocotb compatability We tie ARID low in our manager * Fix prefix issue on cocotb axi test bench Prefixes should not contain trailing "_" * commit to repro 'make WAVES=1' cocotb error from axi-reads-calyx.futil * axi-reads patch * sync debug * Add txn_len initialization to 16 in calyx program * AXI Read fixed to get to read channel start Got rid of "assert_val" and "block_transfer" groups and instead perform these things inside "do_ar_transfer", this is required because we cant assert valid before we drive the data correctly, so needs to happen in parallel. Currently: This seems to write 16 times to same place, this is due to hardcoding of 16 in ar transfer, not sure why address doesn't increment this is tbd (and next TODO) * Add integer byte conversion for tests on Calyx AXI testharness * WIP get reads to work. Add incr_curr_addr group This is part of read channel control sequence * remove .fst from tracking * Add more data to testbench to make waveform viewing easier * Reads seem to be terminating correctly at RLAST * AR transfers seem to work, valid is high for 1 cycle * Unreduced axi-reads-calyx.futil Also reduces data bus width to 32 * Cocotb testbench now passes * Formatted and passing axi-read-tests * Reduce and comment axi-reads-calyx.futil * remove axi-reads.v from being tracked * add a todo * add required ARPROT signal. This is hardcoded to be priviliged * rename directories to yxi/axi-calyx * remove a guard in favor of 1'b1 to simplify reading of source code --------- Co-authored-by: Rachit Nigam <rachit.nigam12@gmail.com>
calyxir · Feb 16, 2024 · 97a05a5 · 97a05a5
1 parent 02ebf99
commit 97a05a5
Show file tree

Hide file tree

Showing 4 changed files with 588 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -39,7 +39,10 @@ __pycache__
 !.vscode/launch.json
 !.vscode/tasks.json
 
+# cocotb artifacts
 tests/xilinx/cocotb/**/hdl
+sim_build/
+results.xml
 
 
 !cider-dap/calyxDebug/package.json
diff --git a/yxi/axi-calyx/axi-reads-calyx.futil b/yxi/axi-calyx/axi-reads-calyx.futil
@@ -0,0 +1,374 @@
+// ###
+// This file contains the components needed to perform read transacitons via AXI.
+// Current goal is to create a cocotb testbench that tests correctness of this.
+// See https://github.com/cucapra/calyx/issues/1733 for more information.
+//
+// This wrapper assumes it is part of a dot product computation with vectors of
+// length 16
+// It assumes a bus data width of 32
+// This is largely a work in progress and as of Nov 20 2023 is not intended to
+// actually be used for anything
+// ###
+
+import "primitives/core.futil";
+import "primitives/compile.futil";
+import "primitives/math.futil";
+import "primitives/memories.futil";
+
+
+//this goes m->s unlike read channel
+component m_arread_channel(
+  ARESET: 1,
+  ARREADY: 1
+) -> (
+  ARVALID: 1,
+  // This needs to be 64, see link below `m_axi` section.
+  ARADDR: 64,
+  // 2^ARSIZE is bytes used in transfer. For memory-mapped AXI (which is what we
+  // are doing I believe), should match width of data bus (to shell?, so 32 wide? This
+  // is 3'b010)
+  // see https://docs.xilinx.com/r/en-US/ug1393-vitis-application-acceleration/Kernel-Interface-Requirements
+  // for restrictions
+  ARSIZE: 3,
+  // in AXI4 this is 8 bits, 1-256 transfers in requested transaction.
+  ARLEN : 8, 
+  // 00 for fixed, 01 for incrementing, 2 for wrap,
+  // needs to be incr for RTL kernels (can't use wrapped of fixed
+  ARBURST : 2,
+  // required by spec. We hardwire this to priviliged access, non secure, data access.
+  ARPROT : 3) {
+  cells{
+      is_arvalid = std_reg(1);
+
+      // gets set high with ARVALID and remains high
+      arvalid_was_high = std_reg(1);
+      // TODO(nathanielnrn): should arguably eventually live in `s_axi_control`
+      // but for now will live here.
+      ref base_addr = std_reg(64);
+
+      // number of trasfers in a transaction. This is sent to subordinate
+      txn_len = std_reg(8);
+
+      // number of txns we want to occur before entire m_arread_channel is done
+      // this is internal to the channel (unlike txn_len)
+      txn_n = std_const(32,1);
+      txn_count = std_reg(32);
+      perform_reads = std_neq(32);
+      txn_adder = std_add(32);
+
+      //"block_transfer" register. need to put into a reg to avoid combinational loops
+      bt_reg = std_reg(1);
+
+
+  }
+
+  wires{
+
+      ARVALID = is_arvalid.out;
+
+      group deassert_val {
+          is_arvalid.in = 1'b0;
+          is_arvalid.write_en = 1'b1;
+          deassert_val[done] = is_arvalid.done;
+      }
+
+      group reset_bt {
+          bt_reg.in = 1'b0;
+          bt_reg.write_en = 1'b1;
+          reset_bt[done] = bt_reg.done;
+      }
+
+      // this asserts valid and defines all inputs correctly
+      // because valid should not be deasserted until handshake occurs
+      // this all needs to be one group
+      // this contains blocking logic previously in its own group
+      group do_ar_transfer {
+          //assert ARVALID
+          is_arvalid.in = !(is_arvalid.out & ARREADY) & !arvalid_was_high.out ? 1'b1;
+
+          // TODO(nathanielnrn): in theory should be able to get rid of arvalid_was_high
+          // but for now we will be explicit and reduce this in generation maybe. Not sure
+          // it even matters.
+          // This makes ARVALID go low after a single cycle. Without it it stays high for 2.
+          is_arvalid.in = is_arvalid.out & ARREADY & arvalid_was_high.out ? 1'b0;
+          is_arvalid.write_en = 1'b1;
+
+
+          arvalid_was_high.in = 1'b1;
+          arvalid_was_high.write_en = !(is_arvalid.out & ARREADY) & !arvalid_was_high.out ? 1'b1;
+
+
+        // drive output signals for transfer  
+          ARADDR = base_addr.out;
+          // see link above, needs to match data width to host.
+          // In this case 2^2 = 4 bytes = 32 bits = width of our data_bus.
+          ARSIZE = 3'b010; 
+          // For now this can be taken from .yxi, as size of mem, because we are assuming
+          // data_bus width that matches size of memory cells
+          // If we want to use bigger mems need to be able to update base addr
+          ARLEN = txn_len.out;
+          ARBURST = 2'b01; //incr
+          // privileged, non-secure, instruction access
+          ARPROT = 3'b110;
+
+
+          //done when one cycle after handshake (handshake happens for a single cycle)
+          bt_reg.in = ARREADY & is_arvalid.out ? 1'b1;
+          bt_reg.in = !(ARREADY & is_arvalid.out) ? 1'b0;
+          bt_reg.write_en = 1'b1;
+          do_ar_transfer[done] = bt_reg.out;
+      }
+
+
+      //txn bookkeeping.
+      //We are done performing reads when txn_count == txn_n
+      group txn_count_init {
+          txn_count.in = 32'b0;
+          txn_count.write_en = 1'b1;
+          txn_count_init[done] = txn_count.done;
+
+      }
+
+      group txn_len_init {
+          //TODO(nathanielnrn): 15 is good for word wide data bus. We'd
+          //expect 16 transfers. Number of transfers that occur is ARLEN + 1
+          txn_len.in = 8'd15;
+          txn_len.write_en = 1'b1;
+          txn_len_init[done] = txn_len.done;
+      }
+
+      group txn_incr {
+          txn_adder.left = txn_count.out;
+          txn_adder.right = 32'b1;
+          txn_count.in = txn_adder.out;
+          txn_count.write_en = 1'b1;
+          txn_incr[done] = txn_count.done;
+
+      }
+
+      comb group check_reads_done {
+          perform_reads.left = txn_count.out;
+          perform_reads.right = txn_n.out;
+      }
+  }
+
+  control{
+      //XXX(nathanielnrn): What is best way to offer more flexiblity beyond just a counter?
+      seq{
+          txn_count_init;
+          txn_len_init;
+          while perform_reads.out with check_reads_done{
+              seq{
+                  reset_bt;
+                  do_ar_transfer;
+                  deassert_val;
+                  txn_incr;
+              }
+          }
+      }
+  }
+}
+
+
+
+
+component m_read_channel(
+  ARESET : 1,
+  RVALID : 1,
+  RLAST  : 1,
+  RDATA  : 32, 
+  RRESP :  2,  // Note: This is generated in subordinate! had this backwards in earlier version
+) -> (
+  // NOTE: In general, according to ZipCPU we want xREADY signals to be registered
+  // because (IIRC) it helps avoid combinational loops between READY and VALID.
+  RREADY : 1,
+) {
+  cells {
+      // 16 is due to dot-product vector length assumption
+      // For this manual implementation we are just writing into this data based
+      // on the data we read from cocotb
+      ref data_received = seq_mem_d1(32, 16, 64);
+      is_rdy = std_reg(1);
+      ref curr_addr = std_reg(64); 
+
+      // registered because RLAST is high with last transfer, not after
+      // before this was registered we were terminating immediately with
+      // last transfer and not servicing it
+      n_RLAST = std_reg(1);
+
+      // TODO: get this width from yxi
+      read_data_reg = std_reg(32);
+
+      //address of seq_d1_mem we are writing to
+      curr_addr_adder = std_add(64);
+
+      // block_transfer reg to avoid combinational loops
+      bt_reg = std_reg(1);
+
+  }
+  wires{
+
+      RREADY = is_rdy.out;
+      data_received.read_en = 1'b0;
+
+      group init_n_RLAST {
+          n_RLAST.in = 1'b1;
+          n_RLAST.write_en = 1'b1;
+          init_n_RLAST[done] = n_RLAST.done;
+      }
+
+      // Used to block any servicing until handshake occurs. 
+      group reset_bt {
+          bt_reg.in = 1'b0;
+          bt_reg.write_en = 1'b1;
+          reset_bt[done] = bt_reg.done;
+      }
+
+      // NOTE: xVALID signals must be high until xREADY is high as well, so this works
+      // because if xREADY is high (is_rdy.out) then RVALID being high makes 1 flip
+      // and group will be done by bt_reg.out
+      group block_transfer {
+        // set RREADY high
+        // TODO (nathanielnrn): technically we can make RREADY depend on on RVALID (but not vice versa).
+        // Could we simplify this we just making things ready when we are in
+        // block_transfer && RVALID?
+
+        //NOTE: is_rdy.in = 1'b1; does not work, it leaves RREADY high for 2 cycles
+        // this both asserts and deasserts one cycle later
+        // TODO(nathanielnrn): Spec recommends defaulting xREADY high as it
+        // can get rid of extra cycle. Maybe doing so here would be useful?
+        // as opposed to waiting for RVALID
+        is_rdy.in = !(RVALID & is_rdy.out) ? 1'b1;
+        is_rdy.in = RVALID & is_rdy.out ? 1'b0;
+        is_rdy.write_en = 1'b1;
+
+
+        //store the data we want to write
+        read_data_reg.in = RDATA;
+        read_data_reg.write_en = is_rdy.out;
+
+        //update n_RLAST reg
+        n_RLAST.in = RLAST ? 1'b0;
+        n_RLAST.in = !RLAST ? 1'b1;
+        n_RLAST.write_en = 1'b1;
+
+
+        // we are done after handshake
+        bt_reg.in = is_rdy.out & RVALID ? 1'b1;
+        bt_reg.in = !(is_rdy.out & RVALID) ? 1'b0;
+        bt_reg.write_en = 1'b1;
+        block_transfer[done] = bt_reg.out;
+      }
+
+      group receive_r_transfer{
+          // keep RREADY low;
+          is_rdy.in = 1'b0;
+          is_rdy.write_en = 1'b1;
+
+          //write the data we received during transfer to seq_d1_mem
+          data_received.addr0 = curr_addr.out;
+          data_received.write_en = 1'b1;
+          data_received.write_data = read_data_reg.out;
+          receive_r_transfer[done] = data_received.write_done;
+
+      }
+
+      group incr_curr_addr{
+          curr_addr_adder.left = 64'd1 ;
+          curr_addr_adder.right = curr_addr.out;
+          curr_addr.in = curr_addr_adder.out;
+          curr_addr.write_en = 1'b1;
+          incr_curr_addr[done] = curr_addr.done;
+      }
+  }
+  control{
+      init_n_RLAST;
+      while n_RLAST.out{
+          seq{
+              reset_bt;
+              block_transfer;
+              receive_r_transfer;
+              incr_curr_addr;
+          }
+      }
+  }
+}
+
+//TODO(nathanielnrn): this is axi_wrapper, prefer to use @toplevel attribute but its not working
+// See individual channel components for explanations of signals
+component main(
+    m_ARESET : 1,
+    m_ARREADY : 1,
+
+    m_RVALID : 1,
+    m_RLAST : 1,
+    m_RDATA : 32,
+    m_RRESP : 2,
+    //NOTE: Only used for cocotb compatability, doesn't do anything within the wrapper itself currently.
+    m_RID : 1,
+) -> (
+    m_ARVALID : 1,
+    m_ARADDR: 64,
+    m_ARSIZE: 3,
+    m_ARLEN : 8, 
+    m_ARBURST : 2,
+
+    m_RREADY : 1,
+    //NOTE: Only used for cocotb compatability, doesn't do anything within the wrapper itself currently.
+    m_ARID : 1
+) {
+    cells{
+        vec1_data = seq_mem_d1(32,16,64);
+        output_data = seq_mem_d1(32,1,0);
+
+        curr_addr = std_reg(64);
+        base_addr = std_reg(64);
+
+        read_channel = m_read_channel();
+        arread_channel = m_arread_channel();
+
+    }
+
+    wires{
+
+        m_ARID = 1'b0;
+
+        group set_curr_to_base_addr{
+            curr_addr.in = base_addr.out;
+            curr_addr.write_en = 1'b1;
+            set_curr_to_base_addr[done] = curr_addr.done;
+        }
+    }
+    control{
+        seq{
+            invoke arread_channel[base_addr = base_addr]
+            (
+            ARESET = m_ARESET,
+            ARREADY = m_ARREADY
+            )
+            (
+            ARVALID = m_ARVALID,
+            ARADDR = m_ARADDR,
+            ARSIZE = m_ARSIZE,
+            ARLEN = m_ARLEN,
+            ARBURST = m_ARBURST
+            );
+
+            set_curr_to_base_addr;
+
+            invoke read_channel[data_received = vec1_data, curr_addr = curr_addr]
+            (
+            ARESET = m_ARESET,
+            RVALID = m_RVALID,
+            RLAST = m_RLAST,
+            RDATA = m_RDATA,
+            RRESP = m_RRESP
+            )
+            (
+            RREADY = m_RREADY
+            );
+        }
+    }
+
+
+}