diff --git a/.gitignore b/.gitignore
index 8a97d3c2f..ccf0b37de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,7 +39,10 @@ __pycache__
 !.vscode/launch.json
 !.vscode/tasks.json
 
+# cocotb artifacts
 tests/xilinx/cocotb/**/hdl
+sim_build/
+results.xml
 
 !cider-dap/calyxDebug/package.json
 
diff --git a/yxi/axi-calyx/axi-reads-calyx.futil b/yxi/axi-calyx/axi-reads-calyx.futil
new file mode 100644
index 000000000..7b5760e72
--- /dev/null
+++ b/yxi/axi-calyx/axi-reads-calyx.futil
@@ -0,0 +1,374 @@
+// ###
+// This file contains the components needed to perform read transactions via AXI.
+// Current goal is to create a cocotb testbench that tests correctness of this.
+// See https://github.com/cucapra/calyx/issues/1733 for more information.
+//
+// This wrapper assumes it is part of a dot product computation with vectors of
+// length 16
+// It assumes a bus data width of 32
+// This is largely a work in progress and as of Nov 20 2023 is not intended to
+// actually be used for anything
+// ###
+
+import "primitives/core.futil";
+import "primitives/compile.futil";
+import "primitives/math.futil";
+import "primitives/memories.futil";
+
+
+//this goes m->s unlike read channel
+component m_arread_channel(
+  ARESET: 1,
+  ARREADY: 1
+) -> (
+  ARVALID: 1,
+  // This needs to be 64, see link below `m_axi` section.
+  ARADDR: 64,
+  // 2^ARSIZE is bytes used in transfer. For memory-mapped AXI (which is what we
+  // are doing I believe), should match width of data bus (to shell?, so 32 wide? This
+  // is 3'b010)
+  // see https://docs.xilinx.com/r/en-US/ug1393-vitis-application-acceleration/Kernel-Interface-Requirements
+  // for restrictions
+  ARSIZE: 3,
+  // in AXI4 this is 8 bits, 1-256 transfers in requested transaction.
+  ARLEN : 8,
+  // 00 for fixed, 01 for incrementing, 10 for wrap,
+  // needs to be incr for RTL kernels (can't use wrapped or fixed)
+  ARBURST : 2,
+  // required by spec. Hardwired to 3'b110 in `do_ar_transfer`. NOTE(review): this comment used to say "privileged access, non secure, data access", which does not match 3'b110 under the AXI AxPROT bit layout -- confirm intended encoding.
+  ARPROT : 3) {
+  cells{
+      is_arvalid = std_reg(1);
+
+      // gets set high with ARVALID and remains high
+      arvalid_was_high = std_reg(1);
+      // TODO(nathanielnrn): should arguably eventually live in `s_axi_control`
+      // but for now will live here.
+      ref base_addr = std_reg(64);
+
+      // number of transfers in a transaction. This is sent to subordinate
+      txn_len = std_reg(8);
+
+      // number of txns we want to occur before entire m_arread_channel is done
+      // this is internal to the channel (unlike txn_len)
+      txn_n = std_const(32,1);
+      txn_count = std_reg(32);
+      perform_reads = std_neq(32);
+      txn_adder = std_add(32);
+
+      //"block_transfer" register. Needs to be a reg to avoid combinational loops
+      bt_reg = std_reg(1);
+
+
+  }
+
+  wires{
+
+      ARVALID = is_arvalid.out;
+
+      group deassert_val {
+          is_arvalid.in = 1'b0;
+          is_arvalid.write_en = 1'b1;
+          deassert_val[done] = is_arvalid.done;
+      }
+
+      group reset_bt {
+          bt_reg.in = 1'b0;
+          bt_reg.write_en = 1'b1;
+          reset_bt[done] = bt_reg.done;
+      }
+
+      // this asserts valid and defines all inputs correctly
+      // because valid should not be deasserted until handshake occurs
+      // this all needs to be one group
+      // this contains blocking logic previously in its own group
+      group do_ar_transfer {
+          //assert ARVALID
+          is_arvalid.in = !(is_arvalid.out & ARREADY) & !arvalid_was_high.out ? 1'b1;
+
+          // TODO(nathanielnrn): in theory should be able to get rid of arvalid_was_high
+          // but for now we will be explicit and reduce this in generation maybe. Not sure
+          // it even matters.
+          // This makes ARVALID go low after a single cycle. Without it, it stays high for 2.
+          is_arvalid.in = is_arvalid.out & ARREADY & arvalid_was_high.out ? 1'b0;
+          is_arvalid.write_en = 1'b1;
+
+
+          arvalid_was_high.in = 1'b1;
+          arvalid_was_high.write_en = !(is_arvalid.out & ARREADY) & !arvalid_was_high.out ? 1'b1;
+
+
+          // drive output signals for transfer
+          ARADDR = base_addr.out;
+          // see link above, needs to match data width to host.
+          // In this case 2^2 = 4 bytes = 32 bits = width of our data_bus.
+          ARSIZE = 3'b010;
+          // For now this can be taken from .yxi, as size of mem, because we are assuming
+          // data_bus width that matches size of memory cells
+          // If we want to use bigger mems need to be able to update base addr
+          ARLEN = txn_len.out;
+          ARBURST = 2'b01; //incr
+          // NOTE(review): comment used to read "privileged, non-secure, instruction access"; per the AXI AxPROT layout ([0] privileged, [1] non-secure, [2] instruction) 3'b110 has the privileged bit clear -- confirm intended value
+          ARPROT = 3'b110;
+
+
+          //done one cycle after handshake (handshake happens for a single cycle)
+          bt_reg.in = ARREADY & is_arvalid.out ? 1'b1;
+          bt_reg.in = !(ARREADY & is_arvalid.out) ? 1'b0;
+          bt_reg.write_en = 1'b1;
+          do_ar_transfer[done] = bt_reg.out;
+      }
+
+
+      //txn bookkeeping.
+      //We are done performing reads when txn_count == txn_n
+      group txn_count_init {
+          txn_count.in = 32'b0;
+          txn_count.write_en = 1'b1;
+          txn_count_init[done] = txn_count.done;
+
+      }
+
+      group txn_len_init {
+          //TODO(nathanielnrn): 15 is good for word wide data bus. We'd
+          //expect 16 transfers. Number of transfers that occur is ARLEN + 1
+          txn_len.in = 8'd15;
+          txn_len.write_en = 1'b1;
+          txn_len_init[done] = txn_len.done;
+      }
+
+      group txn_incr {
+          txn_adder.left = txn_count.out;
+          txn_adder.right = 32'b1;
+          txn_count.in = txn_adder.out;
+          txn_count.write_en = 1'b1;
+          txn_incr[done] = txn_count.done;
+
+      }
+
+      comb group check_reads_done {
+          perform_reads.left = txn_count.out;
+          perform_reads.right = txn_n.out;
+      }
+  }
+
+  control{
+      //XXX(nathanielnrn): What is best way to offer more flexibility beyond just a counter?
+      seq{
+          txn_count_init;
+          txn_len_init;
+          while perform_reads.out with check_reads_done{
+              seq{
+                  reset_bt;
+                  do_ar_transfer;
+                  deassert_val;
+                  txn_incr;
+              }
+          }
+      }
+  }
+}
+
+
+
+
+component m_read_channel(
+  ARESET : 1,
+  RVALID : 1,
+  RLAST : 1,
+  RDATA : 32,
+  RRESP : 2, // Note: This is generated in subordinate! Had this backwards in earlier version
+) -> (
+  // NOTE: In general, according to ZipCPU we want xREADY signals to be registered
+  // because (IIRC) it helps avoid combinational loops between READY and VALID.
+  RREADY : 1,
+) {
+  cells {
+      // 16 is due to dot-product vector length assumption
+      // For this manual implementation we are just writing into this data based
+      // on the data we read from cocotb
+      ref data_received = seq_mem_d1(32, 16, 64);
+      is_rdy = std_reg(1);
+      ref curr_addr = std_reg(64);
+
+      // registered because RLAST is high with last transfer, not after
+      // before this was registered we were terminating immediately with
+      // last transfer and not servicing it
+      n_RLAST = std_reg(1);
+
+      // TODO: get this width from yxi
+      read_data_reg = std_reg(32);
+
+      //address of seq_d1_mem we are writing to
+      curr_addr_adder = std_add(64);
+
+      // block_transfer reg to avoid combinational loops
+      bt_reg = std_reg(1);
+
+  }
+  wires{
+
+      RREADY = is_rdy.out;
+      data_received.read_en = 1'b0;
+
+      group init_n_RLAST {
+          n_RLAST.in = 1'b1;
+          n_RLAST.write_en = 1'b1;
+          init_n_RLAST[done] = n_RLAST.done;
+      }
+
+      // Used to block any servicing until handshake occurs.
+      group reset_bt {
+          bt_reg.in = 1'b0;
+          bt_reg.write_en = 1'b1;
+          reset_bt[done] = bt_reg.done;
+      }
+
+      // NOTE: xVALID signals must be high until xREADY is high as well, so this works
+      // because if xREADY is high (is_rdy.out) then RVALID being high makes 1 flip
+      // and group will be done by bt_reg.out
+      group block_transfer {
+        // set RREADY high
+        // TODO (nathanielnrn): technically we can make RREADY depend on RVALID (but not vice versa).
+        // Could we simplify this by just making things ready when we are in
+        // block_transfer && RVALID?
+
+        //NOTE: is_rdy.in = 1'b1; does not work, it leaves RREADY high for 2 cycles
+        // this both asserts and deasserts one cycle later
+        // TODO(nathanielnrn): Spec recommends defaulting xREADY high as it
+        // can get rid of extra cycle. Maybe doing so here would be useful?
+        // as opposed to waiting for RVALID
+        is_rdy.in = !(RVALID & is_rdy.out) ? 1'b1;
+        is_rdy.in = RVALID & is_rdy.out ? 1'b0;
+        is_rdy.write_en = 1'b1;
+
+
+        //store the data we want to write
+        read_data_reg.in = RDATA;
+        read_data_reg.write_en = is_rdy.out;
+
+        //update n_RLAST reg
+        n_RLAST.in = RLAST ? 1'b0;
+        n_RLAST.in = !RLAST ? 1'b1;
+        n_RLAST.write_en = 1'b1;
+
+
+        // we are done after handshake
+        bt_reg.in = is_rdy.out & RVALID ? 1'b1;
+        bt_reg.in = !(is_rdy.out & RVALID) ? 1'b0;
+        bt_reg.write_en = 1'b1;
+        block_transfer[done] = bt_reg.out;
+      }
+
+      group receive_r_transfer{
+          // keep RREADY low;
+          is_rdy.in = 1'b0;
+          is_rdy.write_en = 1'b1;
+
+          //write the data we received during transfer to seq_d1_mem
+          data_received.addr0 = curr_addr.out;
+          data_received.write_en = 1'b1;
+          data_received.write_data = read_data_reg.out;
+          receive_r_transfer[done] = data_received.write_done;
+
+      }
+
+      group incr_curr_addr{
+          curr_addr_adder.left = 64'd1 ;
+          curr_addr_adder.right = curr_addr.out;
+          curr_addr.in = curr_addr_adder.out;
+          curr_addr.write_en = 1'b1;
+          incr_curr_addr[done] = curr_addr.done;
+      }
+  }
+  control{
+      init_n_RLAST;
+      while n_RLAST.out{
+          seq{
+              reset_bt;
+              block_transfer;
+              receive_r_transfer;
+              incr_curr_addr;
+          }
+      }
+  }
+}
+
+//TODO(nathanielnrn): this is axi_wrapper, prefer to use @toplevel attribute but it's not working
+// See individual channel components for explanations of signals
+component main(
+    m_ARESET : 1,
+    m_ARREADY : 1,
+
+    m_RVALID : 1,
+    m_RLAST : 1,
+    m_RDATA : 32,
+    m_RRESP : 2,
+    //NOTE: Only used for cocotb compatibility, doesn't do anything within the wrapper itself currently.
+    m_RID : 1,
+) -> (
+    m_ARVALID : 1,
+    m_ARADDR: 64,
+    m_ARSIZE: 3,
+    m_ARLEN : 8,
+    m_ARBURST : 2,
+
+    m_RREADY : 1,
+    //NOTE: Only used for cocotb compatibility, doesn't do anything within the wrapper itself currently.
+    m_ARID : 1
+) {
+    cells{
+        vec1_data = seq_mem_d1(32,16,64);
+        output_data = seq_mem_d1(32,1,0); // NOTE(review): not referenced anywhere else in this component
+
+        curr_addr = std_reg(64);
+        base_addr = std_reg(64);
+
+        read_channel = m_read_channel();
+        arread_channel = m_arread_channel();
+
+    }
+
+    wires{
+
+        m_ARID = 1'b0;
+
+        group set_curr_to_base_addr{
+            curr_addr.in = base_addr.out;
+            curr_addr.write_en = 1'b1;
+            set_curr_to_base_addr[done] = curr_addr.done;
+        }
+    }
+    control{
+        seq{
+            invoke arread_channel[base_addr = base_addr]
+            (
+            ARESET = m_ARESET,
+            ARREADY = m_ARREADY
+            )
+            (
+            ARVALID = m_ARVALID,
+            ARADDR = m_ARADDR,
+            ARSIZE = m_ARSIZE,
+            ARLEN = m_ARLEN,
+            ARBURST = m_ARBURST // NOTE(review): the channel's ARPROT output is not bound here and main declares no matching port
+            );
+
+            set_curr_to_base_addr;
+
+            invoke read_channel[data_received = vec1_data, curr_addr = curr_addr]
+            (
+            ARESET = m_ARESET,
+            RVALID = m_RVALID,
+            RLAST = m_RLAST,
+            RDATA = m_RDATA,
+            RRESP = m_RRESP
+            )
+            (
+            RREADY = m_RREADY
+            );
+        }
+    }
+
+
+}
diff --git a/yxi/axi-calyx/cocotb/Makefile b/yxi/axi-calyx/cocotb/Makefile
new file mode 100644
index 000000000..3ae9a87d4
--- /dev/null
+++ b/yxi/axi-calyx/cocotb/Makefile
@@ -0,0 +1,23 @@
+# Makefile
+
+# defaults
+SIM ?= icarus
+TOPLEVEL_LANG ?= verilog
+
+
+#Needed to extract desired test from runt invocation
+
+VERILOG_SOURCES += $(PWD)/../outputs/axi-reads.v
+
+#Defines build directory, if left to default only a single computation is run
+SIM_BUILD=sim_build/axi-reads
+
+# TOPLEVEL is the name of the toplevel module in your Verilog or VHDL file
+TOPLEVEL = main
+
+# MODULE is the basename of the Python test file
+MODULE = axi-read-tests
+
+
+# include cocotb's make rules to take care of the simulator setup
+include $(shell cocotb-config --makefiles)/Makefile.sim
diff --git a/yxi/axi-calyx/cocotb/axi-read-tests.py b/yxi/axi-calyx/cocotb/axi-read-tests.py
new file mode 100644
index 000000000..1c54e4fde
--- /dev/null
+++ b/yxi/axi-calyx/cocotb/axi-read-tests.py
@@ -0,0 +1,188 @@
+import cocotb
+from cocotb.clock import Clock
+from cocotbext.axi import AxiReadBus, AxiRamRead
+from cocotb.triggers import Timer, ClockCycles
+import mmap
+import struct
+from typing import Union, Literal, List
+
+# TODO(nathanielnrn): If optional signals like WSTRB are not recognized,
+# install cocotb-bus directly from github, as 0.2.1 has a bug
+
+
+# Reads 16 elements from mmap of 16*4 bytes. Writes these elements to 16 cells in calyx defined seq_d1 mem.
+@cocotb.test()
+async def read_channels_happy_path(main):
+    happy_data_vec = [
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        128,
+        256,
+        512,
+        1024,
+        2048,
+        4096,
+        8192,
+        16384,
+        32768,
+    ]
+    await read_axi_test_helper(main, happy_data_vec, happy_data_vec)
+
+
+# Adding extra data to backing mmap does not ruin reading of 16 elements and writing them correctly.
+@cocotb.test()
+async def read_channels_extra_mmap_data(main):
+    large_data_vec = [
+        1,
+        2,
+        4,
+        8,
+        16,
+        32,
+        64,
+        128,
+        256,
+        512,
+        1024,
+        2048,
+        4096,
+        8192,
+        16384,
+        32768,
+        2**32 - 1,
+    ]
+    await read_axi_test_helper(main, large_data_vec, large_data_vec[0:16])
+
+
+# Using a small mmap will have the AXI read loop back around
+# NOTE: From what I can tell, this is not part of AXI spec, but rather behavior of cocotb AXI.
+# Still think it is useful to test for to see if anything breaks with this
+@cocotb.test()
+async def read_channels_small_mmap_data(main):
+    small_data_vec = [1, 2, 4, 8, 2**32 - 1]
+    expected_data_vec = [
+        1,
+        2,
+        4,
+        8,
+        2**32 - 1,
+        1,
+        2,
+        4,
+        8,
+        2**32 - 1,
+        1,
+        2,
+        4,
+        8,
+        2**32 - 1,
+        1,
+    ]
+    await read_axi_test_helper(main, small_data_vec, expected_data_vec)
+
+
+async def read_axi_test_helper(
+    module, data_vec: List[int], expected: List[int], mmap_size: int = None
+):
+    """Create an mmap with data of `data_vec` and use this to initialize
+    a cocotb-axi-ram (read only) with this data. Assert that the data that
+    our AXI program reads has been written to the memory inside our Calyx program
+    correctly and matches `expected`.
+
+    mmap_size is in bytes; when None it defaults to 4 bytes per element of `data_vec`.
+
+    """
+    cocotb.start_soon(Clock(module.clk, 2, units="ns").start())
+
+    # Assert reset for 5 cycles (required for Calyx interfacing)
+    module.reset.value = 1
+    await ClockCycles(module.clk, 5)  # wait a bit
+    module.reset.value = 0
+
+    # Start the execution
+    module.go.value = 1
+
+    if mmap_size is None:
+        # 4 bytes per integer
+        mmap_size = len(data_vec) * 4
+    # anonymous mmap for now to back axiram
+    memmap = mmap.mmap(-1, mmap_size)
+    axi_ram_read = AxiRamRead(
+        # NOTE: prefix should not contain the final "_"
+        AxiReadBus.from_prefix(module, "m"),
+        module.clk,
+        module.reset,
+        # size in bytes
+        size=mmap_size,
+        mem=memmap,
+    )
+
+    data_vec_bytes = int_to_bytes(data_vec)
+    memmap.seek(0)
+    memmap.write(data_vec_bytes)
+    memmap.seek(0)
+
+    await Timer(20, "ns")
+    # axi_ram_read.hexdump(0x0000, mmap_size, prefix="RAM")
+
+    await Timer(500, "ns")
+    assert (
+        cocotb_mem_to_ints(module.vec1_data) == expected
+    ), f"main.vec1_data: {cocotb_mem_to_ints(module.vec1_data)} does not contain the data in expected: {expected}."
+
+
+# TODO(nathanielnrn): Decide between these and xilinx cocotb tests, refactor out
+# after determining which is better
+
+
+# Packs a list of integers into bytes, 4 bytes per integer.
+# Does not yet support signed values: the struct format used is `I` (unsigned int); switch to `i` for signed.
+# Not supported cause haven't yet thought about how AXI is affected
+def int_to_bytes(
+    integers, byteorder: Union[Literal["little"], Literal["big"]] = "little"
+):
+    frmt = get_format(byteorder, integers)
+    return struct.pack(frmt, *integers)
+
+
+# returns a tuple of ints, or a single int when the bytes argument holds exactly one 4-byte value
+def bytes_to_int(bytes, byteorder="little"):
+    assert len(bytes) % 4 == 0, "bytes length not divisble by 4."
+    frmt = get_format(byteorder, bytes)
+    ints = struct.unpack(frmt, bytes)
+    if len(ints) == 1:
+        return ints[0]
+    return ints
+
+
+# Returns format string used by struct, assuming we are interested in unsigned integers (so 4 bytes each)
+def get_format(byteorder: Union[Literal["little"], Literal["big"]], input_list):
+    frmt = ""
+    if byteorder == "little":
+        frmt += "<"
+    elif byteorder == "big":
+        frmt += ">"
+    else:
+        raise ValueError("byteorder must be 'little' or 'big'.")
+
+    if type(input_list) is bytes:
+        assert len(input_list) % 4 == 0, "input_list length not divisble by 4."
+        frmt += f"{len(input_list)//4}"
+    elif type(input_list[0]) is int:  # NOTE(review): an empty list raises IndexError here; other element types fall through and yield a bare "I"
+        frmt += f"{len(input_list)}"
+
+    frmt += "I"
+    return frmt
+
+
+# Takes in top level cocotb memory structure and returns integers of bytes contained in it.
+def cocotb_mem_to_ints(memory) -> List[int]:
+    integers = list(map(lambda e: e.integer, memory.mem.value))
+    # Cocotb mem.value seems to store integers in reverse order? So memory cell 0 is
+    # at index -1 and memory cell n-1 is at index 0
+    return integers[::-1]