Skip to content

Commit

Permalink
Working Calyx Implementation of AXI Read channels (#1820)
Browse files Browse the repository at this point in the history
* init commit of hardcoded axi wrapper for a 'main' kernel

* add axi-reads-calyx

* hook up inputs to channels in the wrapper. tbd if this works

* Working calyx version of AR and R

TBD if this actually implements AXI correctly.

There are currently some hacks in place (marked with todos)
to get this to compile, namely some splicing
that doesn't consider what we actually want to splice
(it just takes [31:0]) as opposed to dynamically considering actual
bits we want.

A few other things that should be cleaned up eventually

Need to create a cocotb testbench to test correctness

* Track output of compiled calyx read channel

Maybe this shouldn't be here, but for now (having deleted my
working directory earlier) putting it here

* update gitignore to get rid of sim_build and other cocotb artifacts

* Working make files  for running cocotb tests

Simply run make from the cocotb directory and axi-read-tests
will be executed

* Add xID signals for cocotb compatibility

We tie ARID low in our manager

* Fix prefix issue on cocotb axi test bench

Prefixes should not contain trailing "_"

* commit to repro 'make WAVES=1' cocotb error from axi-reads-calyx.futil

* axi-reads patch

* sync debug

* Add txn_len initialization to 16 in calyx program

* AXI Read fixed to get to read channel start

Got rid of "assert_val" and "block_transfer" groups
and instead perform these things inside "do_ar_transfer", this is
required because we can't assert valid before we drive the data
correctly, so needs to happen in parallel.

Currently: This seems to write 16 times to same place, this is due to
hardcoding of 16 in ar transfer, not sure why address doesn't
increment this is tbd (and next TODO)

* Add integer byte conversion for tests on Calyx AXI testharness

* WIP get reads to work. Add incr_curr_addr group

This is part of read channel control sequence

* remove .fst from tracking

* Add more data to testbench to make waveform viewing easier

* Reads seem to be terminating correctly at RLAST

* AR transfers seem to work, valid is high for 1 cycle

* Unreduced axi-reads-calyx.futil

Also reduces data bus width to 32

* Cocotb testbench now passes

* Formatted and passing axi-read-tests

* Reduce and comment axi-reads-calyx.futil

* remove axi-reads.v from being tracked

* add a todo

* add required ARPROT signal. This is hardcoded to be privileged

* rename directories to yxi/axi-calyx

* remove a guard in favor of 1'b1 to simplify reading of source code

---------

Co-authored-by: Rachit Nigam <rachit.nigam12@gmail.com>
  • Loading branch information
nathanielnrn and rachitnigam committed Feb 16, 2024
1 parent 02ebf99 commit 97a05a5
Show file tree
Hide file tree
Showing 4 changed files with 588 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ __pycache__
!.vscode/launch.json
!.vscode/tasks.json

# cocotb artifacts
tests/xilinx/cocotb/**/hdl
sim_build/
results.xml


!cider-dap/calyxDebug/package.json
374 changes: 374 additions & 0 deletions yxi/axi-calyx/axi-reads-calyx.futil
Original file line number Diff line number Diff line change
@@ -0,0 +1,374 @@
// ###
// This file contains the components needed to perform read transactions via AXI.
// Current goal is to create a cocotb testbench that tests correctness of this.
// See https://github.com/cucapra/calyx/issues/1733 for more information.
//
// This wrapper assumes it is part of a dot product computation with vectors of
// length 16
// It assumes a bus data width of 32
// This is largely a work in progress and as of Nov 20 2023 is not intended to
// actually be used for anything
// ###

import "primitives/core.futil";
import "primitives/compile.futil";
import "primitives/math.futil";
import "primitives/memories.futil";


//this goes m->s unlike read channel
// Manager-side AXI read-address (AR) channel.
//
// Issues `txn_n` address transfers. Each transfer raises ARVALID, presents
// ARADDR/ARSIZE/ARLEN/ARBURST/ARPROT, waits for the ARVALID & ARREADY
// handshake, and then lowers ARVALID again.
//
// Inputs:
//   ARESET  - declared for interface completeness; not read in this component.
//   ARREADY - subordinate is ready to accept the address.
// Outputs: see the per-port comments below.
// Ref cells:
//   base_addr - address presented on ARADDR; supplied by the invoker.
component m_arread_channel(
  ARESET: 1,
  ARREADY: 1
) -> (
  ARVALID: 1,
  // This needs to be 64, see link below `m_axi` section.
  ARADDR: 64,
  // 2^ARSIZE is bytes used in transfer. For memory-mapped AXI (which is what we
  // are doing I believe), should match width of data bus (to shell?, so 32 wide? This
  // is 3'b010)
  // see https://docs.xilinx.com/r/en-US/ug1393-vitis-application-acceleration/Kernel-Interface-Requirements
  // for restrictions
  ARSIZE: 3,
  // in AXI4 this is 8 bits, 1-256 transfers in requested transaction.
  ARLEN : 8,
  // 00 for fixed, 01 for incrementing, 2 for wrap,
  // needs to be incr for RTL kernels (can't use wrapped or fixed)
  ARBURST : 2,
  // required by spec. Hardwired to 3'b110 in `do_ar_transfer`; see the note
  // there for what that value encodes.
  ARPROT : 3) {
  cells{
    // Register behind the ARVALID output.
    is_arvalid = std_reg(1);

    // Set when ARVALID is first raised for a transfer and stays set until
    // `reset_bt` clears it ahead of the next transfer.
    arvalid_was_high = std_reg(1);
    // TODO(nathanielnrn): should arguably eventually live in `s_axi_control`
    // but for now will live here.
    ref base_addr = std_reg(64);

    // number of transfers in a transaction. This is sent to subordinate
    txn_len = std_reg(8);

    // number of txns we want to occur before entire m_arread_channel is done
    // this is internal to the channel (unlike txn_len)
    txn_n = std_const(32,1);
    txn_count = std_reg(32);
    perform_reads = std_neq(32);
    txn_adder = std_add(32);

    //"block_transfer" register. need to put into a reg to avoid combinational loops
    bt_reg = std_reg(1);
  }

  wires{

    ARVALID = is_arvalid.out;

    // Force ARVALID low once a transfer has completed.
    group deassert_val {
      is_arvalid.in = 1'b0;
      is_arvalid.write_en = 1'b1;
      deassert_val[done] = is_arvalid.done;
    }

    // Per-transfer reset. Clears the "handshake seen" flag (bt_reg) and also
    // arvalid_was_high: without clearing the latter, every transfer after the
    // first would find it already set and could never raise ARVALID.
    // Both registers are written in the same cycle, so bt_reg.done is
    // sufficient as the done signal.
    group reset_bt {
      bt_reg.in = 1'b0;
      bt_reg.write_en = 1'b1;
      arvalid_was_high.in = 1'b0;
      arvalid_was_high.write_en = 1'b1;
      reset_bt[done] = bt_reg.done;
    }

    // this asserts valid and defines all inputs correctly
    // because valid should not be deasserted until handshake occurs
    // this all needs to be one group
    // this contains blocking logic previously in its own group
    group do_ar_transfer {
      // assert ARVALID the first time through (arvalid_was_high still clear)
      is_arvalid.in = !(is_arvalid.out & ARREADY) & !arvalid_was_high.out ? 1'b1;

      // TODO(nathanielnrn): in theory should be able to get rid of arvalid_was_high
      // but for now we will be explicit and reduce this in generation maybe. Not sure
      // it even matters.
      // This makes ARVALID go low after a single cycle. Without it it stays high for 2.
      is_arvalid.in = is_arvalid.out & ARREADY & arvalid_was_high.out ? 1'b0;

      // Only write when one of the two guards above is actually driving
      // is_arvalid.in. While ARVALID is high and ARREADY is still low neither
      // guard holds; an unconditional write there would overwrite the
      // asserted value with an undriven input and drop ARVALID before the
      // handshake. Holding the register keeps ARVALID stable until ARREADY.
      is_arvalid.write_en =
        (!(is_arvalid.out & ARREADY) & !arvalid_was_high.out)
        | (is_arvalid.out & ARREADY & arvalid_was_high.out) ? 1'b1;

      arvalid_was_high.in = 1'b1;
      arvalid_was_high.write_en = !(is_arvalid.out & ARREADY) & !arvalid_was_high.out ? 1'b1;

      // drive output signals for transfer
      ARADDR = base_addr.out;
      // see link above, needs to match data width to host.
      // In this case 2^2 = 4 bytes = 32 bits = width of our data_bus.
      ARSIZE = 3'b010;
      // For now this can be taken from .yxi, as size of mem, because we are assuming
      // data_bus width that matches size of memory cells
      // If we want to use bigger mems need to be able to update base addr
      ARLEN = txn_len.out;
      ARBURST = 2'b01; //incr
      // AxPROT bits per the AXI spec: [0] privileged, [1] non-secure,
      // [2] instruction. 3'b110 therefore encodes an unprivileged, non-secure,
      // instruction access.
      // NOTE(review): earlier comments described this as "privileged" (and in
      // one place "data") access, which does not match 3'b110 - confirm which
      // encoding is actually intended before relying on it.
      ARPROT = 3'b110;

      //done when one cycle after handshake (handshake happens for a single cycle)
      bt_reg.in = ARREADY & is_arvalid.out ? 1'b1;
      bt_reg.in = !(ARREADY & is_arvalid.out) ? 1'b0;
      bt_reg.write_en = 1'b1;
      do_ar_transfer[done] = bt_reg.out;
    }

    //txn bookkeeping.
    //We are done performing reads when txn_count == txn_n
    group txn_count_init {
      txn_count.in = 32'b0;
      txn_count.write_en = 1'b1;
      txn_count_init[done] = txn_count.done;
    }

    group txn_len_init {
      //TODO(nathanielnrn): 15 is good for word wide data bus. We'd
      //expect 16 transfers. Number of transfers that occur is ARLEN + 1
      txn_len.in = 8'd15;
      txn_len.write_en = 1'b1;
      txn_len_init[done] = txn_len.done;
    }

    // txn_count <- txn_count + 1
    group txn_incr {
      txn_adder.left = txn_count.out;
      txn_adder.right = 32'b1;
      txn_count.in = txn_adder.out;
      txn_count.write_en = 1'b1;
      txn_incr[done] = txn_count.done;
    }

    // perform_reads.out is high while txn_count != txn_n
    comb group check_reads_done {
      perform_reads.left = txn_count.out;
      perform_reads.right = txn_n.out;
    }
  }

  control{
    //XXX(nathanielnrn): What is best way to offer more flexibility beyond just a counter?
    seq{
      txn_count_init;
      txn_len_init;
      while perform_reads.out with check_reads_done{
        seq{
          reset_bt;
          do_ar_transfer;
          deassert_val;
          txn_incr;
        }
      }
    }
  }
}




component m_read_channel(
  ARESET : 1,
  RVALID : 1,
  RLAST : 1,
  RDATA : 32,
  RRESP : 2, // Produced by the subordinate, so it is an input on the manager side.
) -> (
  // NOTE: following ZipCPU's advice, RREADY comes out of a register (is_rdy)
  // rather than combinational logic, which (IIRC) helps avoid combinational
  // loops between READY and VALID.
  RREADY : 1,
) {
  // Manager-side AXI read-data (R) channel.
  // Repeats "handshake one beat, store it, bump the address" until the beat
  // that carried RLAST has been stored. ARESET and RRESP are declared but not
  // read anywhere in this component.
  cells {
    // ---- cells handed in by the invoker ----
    // Destination memory. The depth of 16 comes from the dot-product vector
    // length assumption; this manual implementation simply stores whatever
    // cocotb sends back.
    ref data_received = seq_mem_d1(32, 16, 64);
    // Index into data_received for the next store.
    ref curr_addr = std_reg(64);

    // ---- local state ----
    // Register behind the RREADY output.
    is_rdy = std_reg(1);

    // Holds the beat captured from RDATA until it is written to memory.
    // TODO: get this width from yxi
    read_data_reg = std_reg(32);

    // Loop condition: stays 1 until a beat with RLAST has been seen.
    // Registered because RLAST is high *with* the last transfer, not after
    // it; testing RLAST directly ended the loop before the last beat was
    // serviced.
    n_RLAST = std_reg(1);

    // block_transfer reg to avoid combinational loops
    bt_reg = std_reg(1);

    // Computes curr_addr + 1.
    curr_addr_adder = std_add(64);
  }
  wires{

    // Continuous assignments: the memory is never read from here, and RREADY
    // simply mirrors is_rdy.
    data_received.read_en = 1'b0;
    RREADY = is_rdy.out;

    // Prime the loop condition so the while loop is entered.
    group init_n_RLAST {
      n_RLAST.write_en = 1'b1;
      n_RLAST.in = 1'b1;
      init_n_RLAST[done] = n_RLAST.done;
    }

    // Clear the "handshake seen" flag so block_transfer waits for a new one.
    group reset_bt {
      bt_reg.write_en = 1'b1;
      bt_reg.in = 1'b0;
      reset_bt[done] = bt_reg.done;
    }

    // Wait for one R handshake and capture its payload.
    // NOTE: xVALID must stay high until xREADY is high too, so once is_rdy.out
    // is high, RVALID being high flips bt_reg and the group finishes on
    // bt_reg.out.
    group block_transfer {
      // RREADY handling.
      // TODO (nathanielnrn): RREADY may legally depend on RVALID (but not the
      // other way round). Could this be simplified to "ready whenever we are
      // in block_transfer && RVALID"?
      // TODO(nathanielnrn): the spec recommends defaulting xREADY high since
      // that can save a cycle; maybe worth doing here instead of waiting for
      // RVALID.
      // NOTE: a plain `is_rdy.in = 1'b1;` does not work - it leaves RREADY
      // high for 2 cycles. The pair below raises RREADY and lowers it again
      // one cycle after the handshake.
      is_rdy.write_en = 1'b1;
      is_rdy.in = is_rdy.out & RVALID ? 1'b0;
      is_rdy.in = !(is_rdy.out & RVALID) ? 1'b1;

      // Latch the bus data for as long as RREADY is high.
      read_data_reg.write_en = is_rdy.out;
      read_data_reg.in = RDATA;

      // Track the inverse of RLAST.
      n_RLAST.write_en = 1'b1;
      n_RLAST.in = !RLAST ? 1'b1;
      n_RLAST.in = RLAST ? 1'b0;

      // Finished one cycle after RVALID & RREADY were both high.
      bt_reg.write_en = 1'b1;
      bt_reg.in = RVALID & is_rdy.out ? 1'b1;
      bt_reg.in = !(RVALID & is_rdy.out) ? 1'b0;
      block_transfer[done] = bt_reg.out;
    }

    // Store the captured beat into the memory at curr_addr.
    group receive_r_transfer{
      // hold RREADY low while the store is in flight
      is_rdy.write_en = 1'b1;
      is_rdy.in = 1'b0;

      data_received.write_data = read_data_reg.out;
      data_received.addr0 = curr_addr.out;
      data_received.write_en = 1'b1;
      receive_r_transfer[done] = data_received.write_done;
    }

    // curr_addr <- curr_addr + 1
    group incr_curr_addr{
      curr_addr_adder.left = curr_addr.out;
      curr_addr_adder.right = 64'd1;
      curr_addr.write_en = 1'b1;
      curr_addr.in = curr_addr_adder.out;
      incr_curr_addr[done] = curr_addr.done;
    }
  }
  control{
    seq{
      init_n_RLAST;
      while n_RLAST.out{
        seq{
          reset_bt;
          block_transfer;
          receive_r_transfer;
          incr_curr_addr;
        }
      }
    }
  }
}

//TODO(nathanielnrn): this is axi_wrapper, prefer to use @toplevel attribute but its not working
// See individual channel components for explanations of signals
// AXI wrapper: runs the AR channel and then the R channel, storing the
// received beats in `vec1_data`.
//
// All `m_*` ports are passed straight through to the identically named ports
// (without the `m_` prefix) of `m_arread_channel` / `m_read_channel`, except
// m_RID, which is not used, and m_ARID, which is tied low.
component main(
  m_ARESET : 1,
  m_ARREADY : 1,

  m_RVALID : 1,
  m_RLAST : 1,
  m_RDATA : 32,
  m_RRESP : 2,
  //NOTE: Only used for cocotb compatibility, doesn't do anything within the wrapper itself currently.
  m_RID : 1,
) -> (
  m_ARVALID : 1,
  m_ARADDR: 64,
  m_ARSIZE: 3,
  m_ARLEN : 8,
  m_ARBURST : 2,

  m_RREADY : 1,
  //NOTE: Only used for cocotb compatibility, doesn't do anything within the wrapper itself currently.
  m_ARID : 1,
  // Forwarded from arread_channel. Appended after the pre-existing outputs so
  // that the earlier port list keeps its order.
  m_ARPROT : 3
) {
  cells{
    // Destination for the data read over the R channel.
    vec1_data = seq_mem_d1(32,16,64);
    // Not referenced by any group or invoke in this component.
    output_data = seq_mem_d1(32,1,0);

    // Passed by reference to read_channel, which uses it as the index into
    // vec1_data.
    curr_addr = std_reg(64);
    // Passed by reference to arread_channel, which drives ARADDR from it.
    // Nothing in this component writes it.
    base_addr = std_reg(64);

    read_channel = m_read_channel();
    arread_channel = m_arread_channel();
  }

  wires{

    // We tie ARID low in our manager.
    m_ARID = 1'b0;

    // curr_addr <- base_addr
    // NOTE(review): base_addr is what goes out on ARADDR, while curr_addr is
    // used as a memory index in read_channel; the two only coincide when
    // base_addr is 0 - confirm this is the intended setup.
    group set_curr_to_base_addr{
      curr_addr.in = base_addr.out;
      curr_addr.write_en = 1'b1;
      set_curr_to_base_addr[done] = curr_addr.done;
    }
  }
  control{
    seq{
      // Address phase. ARPROT is now forwarded as well; previously the
      // channel drove it but the wrapper left it unconnected, so it never
      // reached the bus.
      invoke arread_channel[base_addr = base_addr]
      (
        ARESET = m_ARESET,
        ARREADY = m_ARREADY
      )
      (
        ARVALID = m_ARVALID,
        ARADDR = m_ARADDR,
        ARSIZE = m_ARSIZE,
        ARLEN = m_ARLEN,
        ARBURST = m_ARBURST,
        ARPROT = m_ARPROT
      );

      set_curr_to_base_addr;

      // Data phase: returns once the beat carrying RLAST has been stored.
      invoke read_channel[data_received = vec1_data, curr_addr = curr_addr]
      (
        ARESET = m_ARESET,
        RVALID = m_RVALID,
        RLAST = m_RLAST,
        RDATA = m_RDATA,
        RRESP = m_RRESP
      )
      (
        RREADY = m_RREADY
      );
    }
  }


}
Loading

0 comments on commit 97a05a5

Please sign in to comment.