diff --git a/.gitignore b/.gitignore index ebc50ae..7c8a8fa 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,5 @@ synth-ips.log build-hw.log profile-ips.log magia_venv/ +modelsim.ini +# sw/tests/*/ diff --git a/Bender.local b/Bender.local index c7a6a13..c9fd587 100644 --- a/Bender.local +++ b/Bender.local @@ -1,9 +1,10 @@ overrides: fpnew : { git: "https://github.com/pulp-platform/cvfpu.git" , rev: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 } hci : { git: "https://github.com/pulp-platform/hci.git" , rev: 5a48a854573fca5bbabc1cfd4110fa4530a50ed7 } - cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: 1a93f340e9dadb9f7c8c471f27a40932c8b1c62e } + cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: 37a82d337ba60129c333d104c29e816d0698b53b } cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: a90101211048ba1a16cedbe4db963ab6e12569d7 } axi : { git: "https://github.com/pulp-platform/axi.git" , version: 0.39.5 } + obi : { git: "https://github.com/pulp-platform/obi.git" , rev: 528dc65303d5ffb02fbc254324c6b53eac0dd6e5 } register_interface : { git: "https://github.com/pulp-platform/register_interface.git", rev: e25b36670ff7aab3402f40efcc2b11ee0f31cf19 } idma : { git: "https://github.com/pulp-platform/iDMA.git" , rev: c12caf59bb482fe44b27361f6924ad346b2d22fe } tech_cells_generic : { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.13 } diff --git a/Bender.yml b/Bender.yml index 33fb6e3..51a1585 100644 --- a/Bender.yml +++ b/Bender.yml @@ -13,6 +13,7 @@ # limitations under the License. 
# # Author: Victor Isachi +# Luca Balboni # # Bender manifest @@ -20,10 +21,12 @@ package: name: magia authors: - "Victor Isachi (victor.isachi@unibo.it)" + - "Luca Balboni (luca.balboni10@studio.unibo.it)" dependencies: redmule : { git: "https://github.com/pulp-platform/redmule.git" , rev: 9a1aa14be0b23f0ade84bab57e7e434397ac9876 } # branch: vi/scale_up cv32e40x : { git: "https://github.com/pulp-platform/cv32e40x.git" , rev: a90101211048ba1a16cedbe4db963ab6e12569d7 } # branch: vi/redmule_scaleup + cv32e40p : { git: "https://github.com/pulp-platform/cv32e40p.git" , rev: 37a82d337ba60129c333d104c29e816d0698b53b } idma : { git: "https://github.com/pulp-platform/iDMA.git" , rev: a6b190c7991331432afa9a2899d032bc1b176830 } # branch: vi/redmule_scaleup hwpe-stream : { git: "https://github.com/pulp-platform/hwpe-stream.git" , version: 1.6 } hwpe-ctrl : { git: "https://github.com/pulp-platform/hwpe-ctrl.git" , rev: c35d5b0886ab549fb9144c3c14a4682112330e21 } # branch: yt/reqrsp @@ -31,14 +34,15 @@ dependencies: cluster_icache : { git: "https://github.com/pulp-platform/cluster_icache.git" , rev: 917ecbf908bdaa22c5713bbcff277d142506bb16 } # branch: michaero/astral fpnew : { git: "https://github.com/pulp-platform/cvfpu.git" , rev: "pulp-v0.1.3" } fpu_ss : { git: "https://github.com/pulp-platform/fpu_ss.git" , rev: 8e2eff774d9d38a1e17a46bd56a0936dac9522f0 } # branch: vi/bender_manifest - obi : { git: "https://github.com/pulp-platform/obi.git" , version: 0.1.6 } + obi : { git: "https://github.com/pulp-platform/obi.git" , rev: 528dc65303d5ffb02fbc254324c6b53eac0dd6e5 } # branch: lb/fix_atop_resolver axi : { git: "https://github.com/pulp-platform/axi.git" , version: 0.39.5 } register_interface: { git: "https://github.com/pulp-platform/register_interface.git", version: 0.4.4 } - safety_island : { git: "https://github.com/pulp-platform/safety_island.git" , rev: 2273db6c780ab7c582feaf0c9645ad644c35aa11 } # branch: vi/redmule_scaleup + axi_obi : { git: 
"https://github.com/pulp-platform/axi_obi.git" , rev: 3f29bd67369a093cf5be4ea3fdac3c6c216424cc } # branch: vi/magia common_cells : { git: "https://github.com/pulp-platform/common_cells.git" , version: 1.21.0 } tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.11 } fractal_sync : { git: "https://github.com/VictorIsachi/fractal_sync" , rev: fdb619f40f99d769cfceb20ac2117ff8d99e98a3 } # branch: main floo_noc : { git: "https://github.com/pulp-platform/FlooNoC.git" , rev: f4a36265cda8b56faee45692afb20ddfffba6dee } # branch: main + event_unit_flex : { git: "https://github.com/pulp-platform/event_unit_flex.git" , rev: 763c3b9977970f656326c70a96debfb2ac0f85b2 } export_include_dirs: - hw/include @@ -64,13 +68,21 @@ sources: - hw/tile/converters/obi2hci.sv - hw/tile/converters/hci2obi.sv - hw/tile/converters/xif_if2struct.sv + - hw/tile/converters/obi2hwpe_ctrl.sv + - hw/tile/xbar_periph_bus_if.sv + - hw/tile/cluster_event_map.sv + - hw/tile/magia_event_unit.sv - hw/tile/obi_demux_addr.sv - hw/tile/l1_spm.sv - - hw/tile/idma_xif_inst_decoder.sv - hw/tile/xif_inst_dispatcher.sv - - hw/tile/idma_axi_obi_transfer_ch.sv + - hw/tile/idma_xif_inst_decoder.sv - hw/tile/idma_ctrl.sv + - hw/tile/idma_axi_obi_transfer_ch.sv + - hw/tile/idma_obi_ctrl_decoder.sv + - hw/tile/idma_ctrl_mm.sv - hw/tile/fractal_sync_xif_inst_decoder.sv + - hw/tile/obi_slave_fsync.sv + - hw/tile/core_data_demux_eu_direct.sv - hw/tile/magia_tile.sv # MAGIA DV - target/sim/src/tile/magia_tile_tb_pkg.sv @@ -79,7 +91,8 @@ sources: - target/sim/src/tile/magia_tile_vip.sv - target/sim/src/tile/magia_tile_fixture.sv - target/sim/src/tile/magia_tile_tb.sv - # MAGIA + # MAGIA + - target: all(magia_dv, not(standalone_tile)) defines: CORE_TRACES: ~ @@ -104,13 +117,21 @@ sources: - hw/tile/converters/obi2hci.sv - hw/tile/converters/hci2obi.sv - hw/tile/converters/xif_if2struct.sv + - hw/tile/converters/obi2hwpe_ctrl.sv + - hw/tile/xbar_periph_bus_if.sv + - 
hw/tile/cluster_event_map.sv + - hw/tile/magia_event_unit.sv - hw/tile/obi_demux_addr.sv - hw/tile/l1_spm.sv - - hw/tile/idma_xif_inst_decoder.sv - hw/tile/xif_inst_dispatcher.sv - - hw/tile/idma_axi_obi_transfer_ch.sv + - hw/tile/idma_xif_inst_decoder.sv - hw/tile/idma_ctrl.sv + - hw/tile/idma_axi_obi_transfer_ch.sv + - hw/tile/idma_obi_ctrl_decoder.sv + - hw/tile/idma_ctrl_mm.sv - hw/tile/fractal_sync_xif_inst_decoder.sv + - hw/tile/obi_slave_fsync.sv + - hw/tile/core_data_demux_eu_direct.sv - hw/tile/magia_tile.sv # MAGIA - hw/mesh/magia.sv @@ -144,13 +165,21 @@ sources: - hw/tile/converters/obi2hci.sv - hw/tile/converters/hci2obi.sv - hw/tile/converters/xif_if2struct.sv + - hw/tile/converters/obi2hwpe_ctrl.sv + - hw/tile/xbar_periph_bus_if.sv + - hw/tile/cluster_event_map.sv + - hw/tile/magia_event_unit.sv - hw/tile/obi_demux_addr.sv - hw/tile/l1_spm.sv - - hw/tile/idma_xif_inst_decoder.sv - hw/tile/xif_inst_dispatcher.sv - - hw/tile/idma_axi_obi_transfer_ch.sv + - hw/tile/idma_xif_inst_decoder.sv - hw/tile/idma_ctrl.sv + - hw/tile/idma_axi_obi_transfer_ch.sv + - hw/tile/idma_obi_ctrl_decoder.sv + - hw/tile/idma_ctrl_mm.sv - hw/tile/fractal_sync_xif_inst_decoder.sv + - hw/tile/obi_slave_fsync.sv + - hw/tile/core_data_demux_eu_direct.sv - hw/tile/magia_tile.sv # MAGIA - hw/mesh/noc/floo_axi_mesh_2x2_noc.sv diff --git a/Makefile b/Makefile index 6fa411f..be433dc 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,11 @@ # Paths to folders +ROOT_DIR := $(abspath $(dir $(lastword $(MAKEFILE_LIST)))) +core ?= CV32E40X + MAGIA_DIR ?= $(shell pwd) + SW ?= sw BUILD_DIR ?= sim/work ifneq (,$(wildcard /etc/iis.version)) @@ -34,7 +38,11 @@ BENDER_DIR ?= . 
ISA ?= riscv ARCH ?= rv XLEN ?= 32 -XTEN ?= imafc +ifeq ($(core), CV32E40X) + XTEN = imafc +else + XTEN = imfcxpulpv2 +endif ABI ?= ilp XABI ?= f @@ -86,6 +94,10 @@ ifeq ($(debug),1) FLAGS += -DDEBUG endif +ifeq ($(core), CV32E40X) + FLAGS += -DCV32E40X +endif + # Include directories INC += -Isw INC += -Isw/inc @@ -118,8 +130,8 @@ $(STIM_INSTR) $(STIM_DATA): $(BIN) scripts/parse_s19.pl $(BIN).s19 > $(BIN).txt && \ python scripts/s19tomem.py $(BIN).txt $(STIM_INSTR) $(STIM_DATA) cd $(TEST_DIR)/$(test) && \ - ln -sfn ../../../$(INI_PATH) $(VSIM_INI) && \ - ln -sfn ../../../$(WORK_PATH) $(VSIM_LIBS) + ln -sfn $(ROOT_DIR)/$(INI_PATH) $(VSIM_INI) && \ + ln -sfn $(ROOT_DIR)/$(WORK_PATH) $(VSIM_LIBS) $(BIN): $(CRT) $(OBJ) $(LD) $(LD_OPTS) -o $(BIN) $(CRT) $(OBJ) -T$(LINKSCRIPT) @@ -203,11 +215,23 @@ include bender_sim.mk include bender_synth.mk include bender_profile.mk -bender_defs += -D COREV_ASSERT_OFF +ifeq ($(core), CV32E40X) + bender_defs += -D COREV_ASSERT_OFF +endif + +ifeq ($(core), CV32E40X) + bender_defs += -D CV32E40X +else ifeq ($(core), CV32E40P) + bender_defs += -D CV32E40P +else + $(error Detected unsupported core, must choose among CV32E40X and CV32E40P) +endif bender_targs += -t rtl bender_targs += -t test -bender_targs += -t cv32e40p_exclude_tracer +bender_targs += -t cv32e40p_include_tracer + + # Targets needed to avoid error even though the module is not used bender_targs += -t snitch_cluster bender_targs += -t idma_test @@ -227,9 +251,13 @@ ifeq ($(mesh_dv),1) else tb := magia_tile_tb endif -WAVES := ./wave.do -bender_targs += -t redmule_complex -bender_targs += -t cv32e40x_bhv +WAVES := $(mkfile_path)/wave.do +ifeq ($(core), CV32E40X) + bender_targs += -t redmule_complex + bender_targs += -t cv32e40x_bhv +else + bender_targs += -t redmule_hwpe +endif update-ips: $(BENDER) update diff --git a/README.md b/README.md index db67375..a11fc96 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ By default, the `python` in your `$PATH` is used. 
You can specify the version by The following *optional* parameters can be specified: +`core`: **CV32E40X**|**CV32E40P** (**Default**: CV32E40X). Selects between the cv32e40x core with Xif programming interface and cv32e40p with memory mapped interface. + `mesh_dv`: **0**|**1** (**Default**: 1). 0 simulation of a single tile; 1 simulation of the entire mesh. `fast_sim`: **0**|**1** (**Default**: 0). 0 faster simulation that does not track signals; 1 simulation that tracks signals (for debugging). @@ -36,7 +38,7 @@ The following *optional* parameters can be specified: **1)** Setup the *environment* (`MAGIA` folder): ```bash -source setup_env.sh +source setup_env.sh ``` **2)** Install *python dependencies* (`MAGIA` folder): ```bash @@ -48,7 +50,7 @@ make bender ``` **4)** Clone the *dependencies* and generate the *compilation script* (`MAGIA` folder): ```bash -make update-ips > update-ips.log +make update-ips > update-ips.log ``` **4\*)** Apply FlooNoC *patch* - **currently FlooNoC requires this step but should not need it in the future** (`MAGIA` folder): ```bash @@ -56,27 +58,27 @@ make floonoc-patch ``` **5)** *Build* the hardware (`MAGIA` folder): ```bash -make build-hw > build-hw.log +make build-hw > build-hw.log ``` **6)** *Compile* the test code (`MAGIA` folder): ```bash -make all +make all ``` **7)** *Run* test (`MAGIA` folder): ```bash -make run +make run ``` **Full example**: ```bash make python_venv -source setup_env.sh +source setup_env.sh CV32E40P make python_deps make bender -make update-ips > update-ips.log -make build-hw > build-hw.log fast_sim=1 -make all test=fsync_test -make run test=fsync_test +make update-ips > update-ips.log core=CV32E40P +make build-hw > build-hw.log core=CV32E40P fast_sim=1 +make all test=fsync_test core=CV32E40P +make run test=fsync_test core=CV32E40P ``` ## ⚙️ Architecture diff --git a/bender_common.mk b/bender_common.mk index f40fda7..cff7514 100644 --- a/bender_common.mk +++ b/bender_common.mk @@ -22,6 +22,12 @@ 
common_targs += -t cv32e40p_exclude_tracer # common_targs += -t redmule_hwpe #endif +ifeq ($(core), CV32E40X) + sim_targs += -t cv32e40x +else ifeq ($(core), CV32E40P) + sim_targs += -t cv32e40p +endif + common_targs += -t magia_tile -common_defs += -D COREV_ASSERT_OFF \ No newline at end of file +common_defs += -D COREV_ASSERT_OFF diff --git a/hw/mesh/magia.sv b/hw/mesh/magia.sv index ac88c10..7dac58b 100644 --- a/hw/mesh/magia.sv +++ b/hw/mesh/magia.sv @@ -225,8 +225,11 @@ module magia .wu_wfe_i ); `ifdef CORE_TRACES + `ifdef CV32E40X localparam string core_trace_file_name = $sformatf("%s%0d", "log_file_", i*N_TILES_X+j); defparam i_magia_tile.i_cv32e40x_core.rvfi_i.tracer_i.LOGFILE_PATH_PLUSARG = core_trace_file_name; + `endif + // Note: cv32e40p tracer generates its own filename: trace_core_{cluster_id}_{core_id}.log `endif if (i == 0) begin diff --git a/hw/tile/cluster_event_map.sv b/hw/tile/cluster_event_map.sv new file mode 100644 index 0000000..c3792c2 --- /dev/null +++ b/hw/tile/cluster_event_map.sv @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * + * Simple cluster event mapping module for MAGIA project + * Packs the per-core event sources into one 32-bit event word per core (bit layout documented at the assignment) +*/ + +module cluster_event_map #( + parameter int unsigned NB_CORES = 1 +)( + // Input events from various sources + input logic [NB_CORES-1:0] [7:0] sw_events_i, // Software events (declared, but not referenced by the mapping below) + input logic [NB_CORES-1:0] barrier_events_i, // Barrier events (reduced) + input logic [NB_CORES-1:0] mutex_events_i, // Mutex events (reduced) + input logic [NB_CORES-1:0] dispatch_events_i, // Dispatch events + input logic periph_fifo_event_i, // Peripheral FIFO event (single bit, ORed into bit 0 of every core) + + // Hardware events from accelerators, DMA, timers, etc. + input logic [NB_CORES-1:0] [3:0] acc_events_i, // Accelerator events (4 bits per core) + input logic [NB_CORES-1:0] [1:0] dma_events_i, // DMA events (2 bits per core) + input logic [NB_CORES-1:0] [1:0] timer_events_i, // Timer events (2 bits per core) + input logic [NB_CORES-1:0][31:0] cluster_events_i, // Custom cluster events (32 bits per core; only [31:16] are forwarded) + + // Output: mapped events for each core + output logic [NB_CORES-1:0][31:0] events_mapped_o +); + + // Per-core packing, MSB first: field widths are 16+4+4+2+2+2+1+1 = 32 bits + for (genvar i = 0; i < NB_CORES; i++) begin : gen_event_mapping + assign events_mapped_o[i] = { + cluster_events_i[i][31:16], // [31:16] Custom cluster events (upper 16 bits) + 4'b0, // [15:12] Reserved + acc_events_i[i], // [11:8] Accelerator events + 2'b0, // [7:6] Reserved + timer_events_i[i], // [5:4] Timer events + dma_events_i[i], // [3:2] DMA events + dispatch_events_i[i], // [1] Dispatch event + barrier_events_i[i] | mutex_events_i[i] | periph_fifo_event_i // [0] Barrier OR mutex OR peripheral FIFO event + }; + end + +endmodule : cluster_event_map \ No newline at end of file diff --git a/hw/tile/converters/data2obi.sv b/hw/tile/converters/data2obi.sv index 3c515d6..e6146ee 100644 --- a/hw/tile/converters/data2obi.sv +++ b/hw/tile/converters/data2obi.sv @@ -34,11 +34,19 @@ 
module data2obi_req assign obi_req_o.a.aid = 'b0; assign obi_req_o.a.a_optional.auser = 'b0; assign obi_req_o.a.a_optional.wuser = 'b0; + + assign obi_req_o.a.a_optional.mid = 'b0; + assign obi_req_o.a.a_optional.achk = 'b0; +`ifdef CV32E40X assign obi_req_o.a.a_optional.atop = data_req_i.atop; assign obi_req_o.a.a_optional.memtype = data_req_i.memtype; - assign obi_req_o.a.a_optional.mid = 'b0; assign obi_req_o.a.a_optional.prot = data_req_i.prot; assign obi_req_o.a.a_optional.dbg = data_req_i.dbg; - assign obi_req_o.a.a_optional.achk = 'b0; +`else + assign obi_req_o.a.a_optional.atop = 'b0; + assign obi_req_o.a.a_optional.memtype = 'b0; + assign obi_req_o.a.a_optional.prot = 'b0; + assign obi_req_o.a.a_optional.dbg = 'b0; +`endif endmodule: data2obi_req diff --git a/hw/tile/converters/obi2data.sv b/hw/tile/converters/obi2data.sv index 3b5e705..70fa64e 100644 --- a/hw/tile/converters/obi2data.sv +++ b/hw/tile/converters/obi2data.sv @@ -30,6 +30,8 @@ module obi2data_rsp assign data_rsp_o.rvalid = obi_rsp_i.rvalid; assign data_rsp_o.rdata = obi_rsp_i.r.rdata; assign data_rsp_o.err = obi_rsp_i.r.err; +`ifdef CV32E40X assign data_rsp_o.exokay = obi_rsp_i.r.r_optional.exokay; +`endif endmodule: obi2data_rsp \ No newline at end of file diff --git a/hw/tile/converters/obi2hwpe_ctrl.sv b/hw/tile/converters/obi2hwpe_ctrl.sv new file mode 100644 index 0000000..21c35e3 --- /dev/null +++ b/hw/tile/converters/obi2hwpe_ctrl.sv @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * + * + * OBI to HWPE-ctrl Bridge - Memory-mapped control interface for the RedMulE HWPE-ctrl port + * + */ + +module obi2hwpe_ctrl + import magia_tile_pkg::*; +( + // OBI side + input core_obi_data_req_t obi_req_i, + output core_obi_data_rsp_t obi_rsp_o, + + // HWPE-ctrl (RedMulE) side + output redmule_ctrl_req_t ctrl_req_o, + input redmule_ctrl_rsp_t ctrl_rsp_i +); + + // ------------------------ + // Address channel mapping + // ------------------------ + assign ctrl_req_o.req = obi_req_i.req; + assign obi_rsp_o.gnt = ctrl_rsp_i.gnt; // grant is forwarded unchanged (1:1 handshake) + + assign ctrl_req_o.add = obi_req_i.a.addr; + assign ctrl_req_o.data = obi_req_i.a.wdata; + assign ctrl_req_o.be = obi_req_i.a.be; + assign ctrl_req_o.wen = ~obi_req_i.a.we; // polarity inversion: wen is the complement of the OBI we bit + assign ctrl_req_o.id = '0; // ID tied to zero: OBI doesn't have ID in this config + + // ------------------------ + // Response channel mapping + // ------------------------ + assign obi_rsp_o.rvalid = ctrl_rsp_i.r_valid; + + assign obi_rsp_o.r.rdata = ctrl_rsp_i.r_data; + assign obi_rsp_o.r.err = 1'b0; // error tied low: no error indication is taken from ctrl_rsp_i + + +endmodule + diff --git a/hw/tile/core_data_demux_eu_direct.sv b/hw/tile/core_data_demux_eu_direct.sv new file mode 100644 index 0000000..76ba706 --- /dev/null +++ b/hw/tile/core_data_demux_eu_direct.sv @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * Victor Isachi + * + * Core Data Demux EU Direct Link + * + * This module implements a demux that splits core data requests between: + * - Regular crossbar for general memory/peripheral access + * - EU direct link for low-latency Event Unit access (WFE control) + * + * The demux decision is based on address range: + * - EVENT_UNIT_ADDR_START to EVENT_UNIT_ADDR_END -> EU direct link + * - All other addresses -> Regular crossbar + * + */ + +module core_data_demux_eu_direct + import magia_tile_pkg::*; + import magia_pkg::*; +#( + parameter logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_START = magia_tile_pkg::EVENT_UNIT_ADDR_START, + parameter logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_END = magia_tile_pkg::EVENT_UNIT_ADDR_END +)( + input logic clk_i, + input logic rst_ni, + + // Core data interface (input from the core, e.g. cv32e40p; CV32E40X-only fields are ifdef-guarded) + input magia_tile_pkg::core_data_req_t core_data_req_i, + output magia_tile_pkg::core_data_rsp_t core_data_rsp_o, + + // Regular crossbar interface (for general memory/peripheral access) + output magia_tile_pkg::core_data_req_t xbar_data_req_o, + input magia_tile_pkg::core_data_rsp_t xbar_data_rsp_i, + + // EU direct link interface (abstract types) + output magia_tile_pkg::eu_direct_req_t eu_direct_req_o, + input magia_tile_pkg::eu_direct_rsp_t eu_direct_rsp_i +); + + enum logic {XBAR, EU} request_destination, request_destination_next; + + // Address range detection for EU direct access (pure combinational; bounds are inclusive) + logic use_eu_direct; + logic request_granted; + + assign use_eu_direct = core_data_req_i.req && + (core_data_req_i.addr >= EVENT_UNIT_ADDR_START) && + (core_data_req_i.addr <= EVENT_UNIT_ADDR_END); + + // Grant occurs when request is accepted by the selected path + assign request_granted = core_data_req_i.req && core_data_rsp_o.gnt; + + // Determine next destination 
when a request is granted + assign request_destination_next = use_eu_direct ? EU : XBAR; + + // Latch the destination of each GRANTED request; the value is held until the next grant (reset value: XBAR) + always_ff @(posedge clk_i, negedge rst_ni) begin : _UPDATE_RESPONSE_DESTINATION_ + if (!rst_ni) begin + request_destination <= XBAR; + end else begin + if (request_granted) begin + request_destination <= request_destination_next; + end + end + end + + // To regular crossbar + assign xbar_data_req_o.req = core_data_req_i.req && !use_eu_direct; + assign xbar_data_req_o.addr = core_data_req_i.addr; + assign xbar_data_req_o.be = core_data_req_i.be; + assign xbar_data_req_o.wdata = core_data_req_i.wdata; + assign xbar_data_req_o.we = core_data_req_i.we; +`ifdef CV32E40X + assign xbar_data_req_o.atop = core_data_req_i.atop; + assign xbar_data_req_o.memtype = core_data_req_i.memtype; + assign xbar_data_req_o.prot = core_data_req_i.prot; + assign xbar_data_req_o.dbg = core_data_req_i.dbg; +`endif + + // To EU direct link (abstract interface) + // Pass relative offset to Event Unit (subtract base address) + // Event Unit expects offset within its address space [9:0], not absolute address + assign eu_direct_req_o.req = core_data_req_i.req && use_eu_direct; + assign eu_direct_req_o.addr = core_data_req_i.addr - EVENT_UNIT_ADDR_START; + assign eu_direct_req_o.wen = ~core_data_req_i.we; // EU expects wen (write enable negated) + assign eu_direct_req_o.wdata = core_data_req_i.wdata; + assign eu_direct_req_o.be = core_data_req_i.be; + + // Response routing - uses stored destination. NOTE(review): a single stored destination presumably assumes at most one outstanding request - confirm + always_comb begin : _HANDLE_RESP_ + case (request_destination) + XBAR: begin + core_data_rsp_o.rvalid = xbar_data_rsp_i.rvalid; + core_data_rsp_o.rdata = xbar_data_rsp_i.rdata; + core_data_rsp_o.err = xbar_data_rsp_i.err; +`ifdef CV32E40X + core_data_rsp_o.exokay = xbar_data_rsp_i.exokay; +`endif + end + EU: begin + core_data_rsp_o.rvalid = eu_direct_rsp_i.rvalid; + core_data_rsp_o.rdata = eu_direct_rsp_i.rdata; + core_data_rsp_o.err = 
eu_direct_rsp_i.err; +`ifdef CV32E40X + core_data_rsp_o.exokay = '0; +`endif + end + endcase + end + + // GNT is combinational: selected by the address decode of the current request, not by the stored destination + assign core_data_rsp_o.gnt = use_eu_direct ? eu_direct_rsp_i.gnt : xbar_data_rsp_i.gnt; + +endmodule \ No newline at end of file diff --git a/hw/tile/idma_axi_obi_transfer_ch.sv b/hw/tile/idma_axi_obi_transfer_ch.sv index 8e7a5b6..a51474e 100644 --- a/hw/tile/idma_axi_obi_transfer_ch.sv +++ b/hw/tile/idma_axi_obi_transfer_ch.sv @@ -45,7 +45,13 @@ module idma_axi_obi_transfer_ch input axi_rsp_t axi_rsp_i, output obi_req_t obi_req_o, - input obi_rsp_t obi_rsp_i + input obi_rsp_t obi_rsp_i, + + // IRQ-related outputs + output logic transfer_busy_o, + output logic transfer_start_o, + output logic transfer_done_o, + output logic transfer_error_o ); /*******************************************************/ @@ -321,5 +327,13 @@ module idma_axi_obi_transfer_ch /*******************************************************/ /** Back-end End **/ /*******************************************************/ +/** IRQ Signal Generation **/ +/*******************************************************/ + + // Generate IRQ signals from internal transfer state + assign transfer_busy_o = |busy; // Any busy indication from iDMA + assign transfer_start_o = issue_id; // Transfer started (ID issued) + assign transfer_done_o = retire_id; // Transfer completed (ID retired) + assign transfer_error_o = eh_req_valid; // Error handling request indicates error endmodule: idma_axi_obi_transfer_ch \ No newline at end of file diff --git a/hw/tile/idma_ctrl_mm.sv b/hw/tile/idma_ctrl_mm.sv new file mode 100644 index 0000000..a1a8293 --- /dev/null +++ b/hw/tile/idma_ctrl_mm.sv @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * Based on idma_ctrl by Victor Isachi + * + * iDMA Memory-Mapped Controller + * + * This module provides memory-mapped control interface for iDMA transfers + * with interrupt support. It wraps both AXI2OBI and OBI2AXI transfer channels + * along with the memory-mapped bridge, providing equivalent functionality + * to idma_ctrl but using memory-mapped register access instead of ISA extensions. + */ + +module idma_ctrl_mm + import magia_tile_pkg::*; + import idma_pkg::*; +#( + parameter int unsigned ERROR_CAP = 3, + parameter type obi_req_t = magia_tile_pkg::core_obi_data_req_t, + parameter type obi_rsp_t = magia_tile_pkg::core_obi_data_rsp_t, + parameter type idma_fe_reg_req_t = magia_tile_pkg::idma_fe_reg_req_t, + parameter type idma_fe_reg_rsp_t = magia_tile_pkg::idma_fe_reg_rsp_t, + parameter type axi_req_t = magia_tile_pkg::idma_axi_req_t, + parameter type axi_rsp_t = magia_tile_pkg::idma_axi_rsp_t, + parameter type idma_obi_req_t = magia_tile_pkg::idma_obi_req_t, + parameter type idma_obi_rsp_t = magia_tile_pkg::idma_obi_rsp_t +)( + input logic clk_i, + input logic rst_ni, + input logic test_en_i, + input logic clear_i, + + // OBI Slave Interface (CPU memory-mapped access) + input obi_req_t obi_req_i, + output obi_rsp_t obi_rsp_o, + + // AXI Master Interfaces (to L2 memory) + output axi_req_t axi_read_req_o, // AXI2OBI: L2 read + input axi_rsp_t axi_read_rsp_i, + output axi_req_t axi_write_req_o, // OBI2AXI: L2 write + input axi_rsp_t axi_write_rsp_i, + + // OBI Master Interfaces 
(to L1 memory) + output idma_obi_req_t obi_read_req_o, // OBI2AXI: L1 read + input idma_obi_rsp_t obi_read_rsp_i, + output idma_obi_req_t obi_write_req_o, // AXI2OBI: L1 write + input idma_obi_rsp_t obi_write_rsp_i, + + // Per-channel status/IRQ outputs (a2o = AXI2OBI channel, o2a = OBI2AXI channel), driven directly by the transfer channels + output logic irq_a2o_busy_o, + output logic irq_a2o_start_o, + output logic irq_a2o_done_o, + output logic irq_a2o_error_o, + output logic irq_o2a_busy_o, + output logic irq_o2a_start_o, + output logic irq_o2a_done_o, + output logic irq_o2a_error_o +); + +/*******************************************************/ +/** Internal Signal Definitions Beginning **/ +/*******************************************************/ + + // Register front-end request/response pairs linking the OBI decoder to each transfer channel + idma_fe_reg_req_t idma_fe_reg_axi2obi_req; + idma_fe_reg_rsp_t idma_fe_reg_axi2obi_rsp; + idma_fe_reg_req_t idma_fe_reg_obi2axi_req; + idma_fe_reg_rsp_t idma_fe_reg_obi2axi_rsp; + + // Transfer channel status signals (forwarded unchanged to the irq_* outputs) + logic a2o_transfer_busy; + logic a2o_transfer_start; + logic a2o_transfer_done; + logic a2o_transfer_error; + logic o2a_transfer_busy; + logic o2a_transfer_start; + logic o2a_transfer_done; + logic o2a_transfer_error; + +/*******************************************************/ +/** Transfer Channels Instantiation **/ +/*******************************************************/ + + // AXI2OBI Transfer Channel (L2 to L1). NOTE(review): types are bound to magia_tile_pkg directly, not to this module's type parameters - confirm intended + idma_axi_obi_transfer_ch #( + .CHANNEL_T ( magia_tile_pkg::AXI2OBI ), + .ERROR_CAP ( ERROR_CAP ), + .idma_fe_reg_req_t ( magia_tile_pkg::idma_fe_reg_req_t ), + .idma_fe_reg_rsp_t ( magia_tile_pkg::idma_fe_reg_rsp_t ), + .axi_req_t ( magia_tile_pkg::idma_axi_req_t ), + .axi_rsp_t ( magia_tile_pkg::idma_axi_rsp_t ), + .obi_req_t ( magia_tile_pkg::idma_obi_req_t ), + .obi_rsp_t ( magia_tile_pkg::idma_obi_rsp_t ) + ) i_l2_to_l1_ch ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .testmode_i ( test_en_i ), + .clear_i ( clear_i ), + .cfg_req_i ( idma_fe_reg_axi2obi_req ), + .cfg_rsp_o ( 
idma_fe_reg_axi2obi_rsp ), + .axi_req_o ( axi_read_req_o ), + .axi_rsp_i ( axi_read_rsp_i ), + .obi_req_o ( obi_write_req_o ), + .obi_rsp_i ( obi_write_rsp_i ), + .transfer_busy_o ( a2o_transfer_busy ), + .transfer_start_o ( a2o_transfer_start ), + .transfer_done_o ( a2o_transfer_done ), + .transfer_error_o ( a2o_transfer_error ) + ); + + // OBI2AXI Transfer Channel (L1 to L2); same direct magia_tile_pkg type binding as the AXI2OBI channel + idma_axi_obi_transfer_ch #( + .CHANNEL_T ( magia_tile_pkg::OBI2AXI ), + .ERROR_CAP ( ERROR_CAP ), + .idma_fe_reg_req_t ( magia_tile_pkg::idma_fe_reg_req_t ), + .idma_fe_reg_rsp_t ( magia_tile_pkg::idma_fe_reg_rsp_t ), + .axi_req_t ( magia_tile_pkg::idma_axi_req_t ), + .axi_rsp_t ( magia_tile_pkg::idma_axi_rsp_t ), + .obi_req_t ( magia_tile_pkg::idma_obi_req_t ), + .obi_rsp_t ( magia_tile_pkg::idma_obi_rsp_t ) + ) i_l1_to_l2_ch ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .testmode_i ( test_en_i ), + .clear_i ( clear_i ), + .cfg_req_i ( idma_fe_reg_obi2axi_req ), + .cfg_rsp_o ( idma_fe_reg_obi2axi_rsp ), + .axi_req_o ( axi_write_req_o ), + .axi_rsp_i ( axi_write_rsp_i ), + .obi_req_o ( obi_read_req_o ), + .obi_rsp_i ( obi_read_rsp_i ), + .transfer_busy_o ( o2a_transfer_busy ), + .transfer_start_o ( o2a_transfer_start ), + .transfer_done_o ( o2a_transfer_done ), + .transfer_error_o ( o2a_transfer_error ) + ); + +/*******************************************************/ +/** Memory-Mapped Bridge (OBI control decoder) **/ +/*******************************************************/ + + idma_obi_ctrl_decoder i_idma_obi_ctrl_decoder ( + .obi_req_i ( obi_req_i ), + .obi_rsp_o ( obi_rsp_o ), + + .idma_axi2obi_req_o ( idma_fe_reg_axi2obi_req ), + .idma_axi2obi_rsp_i ( idma_fe_reg_axi2obi_rsp ), + .idma_obi2axi_req_o ( idma_fe_reg_obi2axi_req ), + .idma_obi2axi_rsp_i ( idma_fe_reg_obi2axi_rsp ) + ); + + + // IRQ pass-through: each output mirrors one transfer-channel status signal (no serialization or latching in this module) + assign irq_a2o_start_o = a2o_transfer_start; + assign irq_a2o_busy_o = a2o_transfer_busy; + assign irq_a2o_done_o = 
a2o_transfer_done; + assign irq_a2o_error_o = a2o_transfer_error; + + assign irq_o2a_start_o = o2a_transfer_start; + assign irq_o2a_busy_o = o2a_transfer_busy; + assign irq_o2a_done_o = o2a_transfer_done; + assign irq_o2a_error_o = o2a_transfer_error; + +/*******************************************************/ +/** Simple IRQ Logic End **/ +/*******************************************************/ + +endmodule: idma_ctrl_mm \ No newline at end of file diff --git a/hw/tile/idma_obi_ctrl_decoder.sv b/hw/tile/idma_obi_ctrl_decoder.sv new file mode 100644 index 0000000..9939659 --- /dev/null +++ b/hw/tile/idma_obi_ctrl_decoder.sv @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * Based on idma_ctrl by Victor Isachi + * + * OBI to iDMA Bridge - Memory-mapped control interface for iDMA + * + */ + +module idma_obi_ctrl_decoder + import magia_tile_pkg::*; + import magia_pkg::*; + #( + parameter type obi_req_t = magia_tile_pkg::core_obi_data_req_t, + parameter type obi_rsp_t = magia_tile_pkg::core_obi_data_rsp_t, + parameter type idma_fe_reg_req_t = magia_tile_pkg::idma_fe_reg_req_t, + parameter type idma_fe_reg_rsp_t = magia_tile_pkg::idma_fe_reg_rsp_t +)( + // OBI Slave Interface (CPU access) + input obi_req_t obi_req_i, + output obi_rsp_t obi_rsp_o, + + // iDMA Register Frontend Interface + output idma_fe_reg_req_t idma_axi2obi_req_o, + input idma_fe_reg_rsp_t idma_axi2obi_rsp_i, + + output idma_fe_reg_req_t idma_obi2axi_req_o, + input idma_fe_reg_rsp_t idma_obi2axi_rsp_i +); + +/*******************************************************/ +/** Internal Signal Definitions Beginning **/ +/*******************************************************/ + + // Address decode parameters - use parametric base address and size + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_BASE_ADDR = magia_tile_pkg::IDMA_CTRL_ADDR_START; + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_SIZE = magia_tile_pkg::IDMA_CTRL_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_END_ADDR = magia_tile_pkg::IDMA_CTRL_ADDR_END; + + localparam int unsigned ADDR_WIDTH = 32; + localparam int unsigned DIRECTION_OFFSET = 12'h200; // +0x200 for direction change + + // Register offset definitions based on official reg32_3d spec + localparam logic [11:0] IDMA_CONFIG_OFFSET = 12'h0; + + localparam logic [11:0] IDMA_STATUS_0_OFFSET = 12'h4; + localparam logic [11:0] IDMA_STATUS_1_OFFSET = 12'h8; + localparam logic [11:0] IDMA_STATUS_2_OFFSET = 12'hc; + localparam logic [11:0] IDMA_STATUS_3_OFFSET = 12'h10; + + localparam logic [11:0] IDMA_NEXT_ID_0_OFFSET = 12'h44; + localparam logic [11:0] IDMA_NEXT_ID_1_OFFSET = 12'h48; + 
+ localparam logic [11:0] IDMA_DONE_ID_0_OFFSET = 12'h84; + + localparam logic [11:0] IDMA_DST_ADDR_LOW_OFFSET = 12'hd0; + localparam logic [11:0] IDMA_SRC_ADDR_LOW_OFFSET = 12'hd8; + localparam logic [11:0] IDMA_LENGTH_LOW_OFFSET = 12'he0; + localparam logic [11:0] IDMA_DST_STRIDE_2_LOW_OFFSET = 12'he8; + localparam logic [11:0] IDMA_SRC_STRIDE_2_LOW_OFFSET = 12'hf0; + localparam logic [11:0] IDMA_REPS_2_LOW_OFFSET = 12'hf8; + localparam logic [11:0] IDMA_DST_STRIDE_3_LOW_OFFSET = 12'h100; + localparam logic [11:0] IDMA_SRC_STRIDE_3_LOW_OFFSET = 12'h108; + localparam logic [11:0] IDMA_REPS_3_LOW_OFFSET = 12'h110; + + logic direction; // Direction of the iDMA channel: 0 -> AXI2OBI; 1 -> OBI2AXI + logic [11:0] reg_offset; + logic is_valid_access; + logic is_address_in_range; + + idma_fe_reg_req_t selected_idma_req; + idma_fe_reg_rsp_t selected_idma_rsp; + +/*******************************************************/ +/** Internal Signal Definitions End **/ +/*******************************************************/ +/** Address Decoding Beginning **/ +/*******************************************************/ + + // Address range validation - check if address is within iDMA control space + assign is_address_in_range = (obi_req_i.a.addr >= IDMA_BASE_ADDR) && + (obi_req_i.a.addr <= IDMA_END_ADDR); + + // Address decoding - check if address is in OBI2AXI range (+0x200 offset) + assign direction = (obi_req_i.a.addr >= (IDMA_BASE_ADDR + DIRECTION_OFFSET)) ? 1'b1 : 1'b0; + assign reg_offset = direction ? 
+ (obi_req_i.a.addr[11:0] - IDMA_BASE_ADDR[11:0] - DIRECTION_OFFSET[11:0]) : + (obi_req_i.a.addr[11:0] - IDMA_BASE_ADDR[11:0]); + + // Validate access: must be in address range AND at known register offset + assign is_valid_access = is_address_in_range && ( + (reg_offset == IDMA_CONFIG_OFFSET) || + // Status registers (multireg 0x4-0x40, step 4) + ((reg_offset >= 12'h4) && (reg_offset <= 12'h40) && ((reg_offset & 12'h3) == 12'h0)) || + // Next ID registers (multireg 0x44-0x80, step 4) + ((reg_offset >= 12'h44) && (reg_offset <= 12'h80) && ((reg_offset & 12'h3) == 12'h0)) || + // Done ID registers (multireg 0x84-0xc0, step 4) + ((reg_offset >= 12'h84) && (reg_offset <= 12'hc0) && ((reg_offset & 12'h3) == 12'h0)) || + // Configuration registers at specific skipto addresses + (reg_offset == IDMA_DST_ADDR_LOW_OFFSET) || + (reg_offset == IDMA_SRC_ADDR_LOW_OFFSET) || + (reg_offset == IDMA_LENGTH_LOW_OFFSET) || + (reg_offset == IDMA_DST_STRIDE_2_LOW_OFFSET) || + (reg_offset == IDMA_SRC_STRIDE_2_LOW_OFFSET) || + (reg_offset == IDMA_REPS_2_LOW_OFFSET) || + (reg_offset == IDMA_DST_STRIDE_3_LOW_OFFSET) || + (reg_offset == IDMA_SRC_STRIDE_3_LOW_OFFSET) || + (reg_offset == IDMA_REPS_3_LOW_OFFSET) + ); + +/*******************************************************/ +/** Address Decoding End **/ +/*******************************************************/ +/** Channel Selection Beginning **/ +/*******************************************************/ + + // Channel demultiplexer + always_comb begin: channel_demux + // Default assignments + idma_axi2obi_req_o = '0; + idma_obi2axi_req_o = '0; + selected_idma_rsp = '0; + + if (is_valid_access && obi_req_i.req) begin + if (direction) begin // OBI2AXI channel (L1->L2) + idma_obi2axi_req_o = selected_idma_req; + selected_idma_rsp = idma_obi2axi_rsp_i; + end else begin // AXI2OBI channel (L2->L1) + idma_axi2obi_req_o = selected_idma_req; + selected_idma_rsp = idma_axi2obi_rsp_i; + end + end + end + 
+/*******************************************************/ +/** Channel Selection End **/ +/*******************************************************/ +/** OBI Protocol Handling Beginning **/ +/*******************************************************/ + + // Convert OBI transaction to iDMA register access + always_comb begin: obi_to_idma_reg + selected_idma_req.addr = {20'h0, reg_offset}; // Use only offset for iDMA frontend + selected_idma_req.write = obi_req_i.a.we; + selected_idma_req.wdata = obi_req_i.a.wdata; + selected_idma_req.wstrb = obi_req_i.a.be; + selected_idma_req.valid = obi_req_i.req && is_valid_access; + end + + // OBI response - purely combinatorial like XIF interface + always_comb begin: idma_reg_to_obi + // Grant immediately for valid requests (OBI protocol requirement) + obi_rsp_o.gnt = obi_req_i.req && is_valid_access; + + // Response valid when iDMA is ready to respond (both read and write) + obi_rsp_o.rvalid = selected_idma_rsp.ready && is_valid_access; + + // Read data directly from iDMA response (writes return 0) + obi_rsp_o.r.rdata = selected_idma_rsp.rdata; + obi_rsp_o.r.r_optional = '0; + obi_rsp_o.r.err = selected_idma_rsp.error || !is_valid_access; + obi_rsp_o.r.rid = '0; + end + +/*******************************************************/ +/** OBI Protocol Handling End **/ +/*******************************************************/ +/** Debug Display Statements **/ +/*******************************************************/ + + +endmodule: idma_obi_ctrl_decoder \ No newline at end of file diff --git a/hw/tile/magia_event_unit.sv b/hw/tile/magia_event_unit.sv new file mode 100644 index 0000000..32050a2 --- /dev/null +++ b/hw/tile/magia_event_unit.sv @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * + * Wrapper module for MAGIA Event Unit optimized for single-core systems +*/ + +module magia_event_unit +import magia_tile_pkg::*; +#( + // MAGIA Event Unit Parameters - Optimized for single-core system + parameter int unsigned NB_CORES = 1, // Single core system + parameter int unsigned NB_SW_EVT = 1, // Minimal SW events for basic functionality + parameter int unsigned NB_BARR = 0, // Barrier units disabled (no sync needed) + parameter int unsigned NB_HW_MUT = 0, // Hardware mutexes disabled (no contention) + parameter int unsigned MUTEX_MSG_W = 32, // Mutex message width (unused but kept for compatibility) + parameter int unsigned DISP_FIFO_DEPTH = 0, // Task dispatcher disabled (no distribution) + parameter int unsigned EVNT_WIDTH = 8, // SOC event width (external events) + parameter int unsigned SOC_FIFO_DEPTH = 8 // SOC event FIFO depth (external events) +) +( + // clock and reset + input logic clk_i, + input logic rst_ni, + input logic test_mode_i, + + // Event inputs (from accelerators, DMA, etc.) 
+ input logic [NB_CORES-1:0] [3:0] acc_events_i, + input logic [NB_CORES-1:0] [1:0] dma_events_i, + input logic [NB_CORES-1:0] [1:0] timer_events_i, + input logic [NB_CORES-1:0][31:0] other_events_i, + + // Core IRQ interface (both directions needed for proper operation) + output logic [NB_CORES-1:0] core_irq_req_o, + output logic [NB_CORES-1:0] [4:0] core_irq_id_o, + input logic [NB_CORES-1:0] core_irq_ack_i, + input logic [NB_CORES-1:0] [4:0] core_irq_ack_id_i, + + // Core control interface + input logic [NB_CORES-1:0] core_busy_i, + output logic [NB_CORES-1:0] core_clock_en_o, + + // Debug interface (bidirectional) + input logic [NB_CORES-1:0] dbg_req_i, + output logic [NB_CORES-1:0] core_dbg_req_o, + + // EU Direct Link interface + input logic eu_direct_req_i, + input logic [31:0] eu_direct_addr_i, + input logic eu_direct_wen_i, + input logic [31:0] eu_direct_wdata_i, + input logic [3:0] eu_direct_be_i, + output logic eu_direct_gnt_o, + output logic eu_direct_rvalid_o, + output logic [31:0] eu_direct_rdata_o, + output logic eu_direct_err_o +); + + // Create internal interface instances + XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) eu_direct_link[NB_CORES-1:0](); + XBAR_PERIPH_BUS #(.ID_WIDTH(NB_CORES+1)) speriph_slave(); // Tied off + + // Internal signals + logic soc_periph_evt_ready_internal; + + // Convert abstract eu_direct interface to XBAR_PERIPH_BUS + // eu_direct_addr_i already contains relative offset (subtracted by demux) + assign eu_direct_link[0].req = eu_direct_req_i; + assign eu_direct_link[0].add = eu_direct_addr_i; + assign eu_direct_link[0].wen = eu_direct_wen_i; + assign eu_direct_link[0].wdata = eu_direct_wdata_i; + assign eu_direct_link[0].be = eu_direct_be_i; + assign eu_direct_link[0].id = '0; + + // Convert XBAR_PERIPH_BUS response to abstract interface + // Event Unit handles all power management and grant logic internally + assign eu_direct_gnt_o = eu_direct_link[0].gnt; + assign eu_direct_rvalid_o = eu_direct_link[0].r_valid; + assign 
eu_direct_rdata_o = eu_direct_link[0].r_rdata; + assign eu_direct_err_o = eu_direct_link[0].r_opc; // r_opc: 0=OK, 1=ERROR + + // Tie off speriph_slave (not used anymore) + assign speriph_slave.req = 1'b0; + assign speriph_slave.add = '0; + assign speriph_slave.wen = 1'b1; + assign speriph_slave.wdata = '0; + assign speriph_slave.be = '0; + assign speriph_slave.id = '0; + + + + // Event Unit Flex instantiation + event_unit_top #( + .NB_CORES ( NB_CORES ), + .NB_SW_EVT ( NB_SW_EVT ), + .NB_BARR ( NB_BARR ), + .NB_HW_MUT ( NB_HW_MUT ), + .MUTEX_MSG_W ( MUTEX_MSG_W ), + .DISP_FIFO_DEPTH ( DISP_FIFO_DEPTH ), + .PER_ID_WIDTH ( NB_CORES+1 ), + .EVNT_WIDTH ( EVNT_WIDTH ), + .SOC_FIFO_DEPTH ( SOC_FIFO_DEPTH ) + ) i_event_unit_top ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_mode_i ( test_mode_i ), + .acc_events_i ( acc_events_i ), + .dma_events_i ( dma_events_i ), + .timer_events_i ( timer_events_i ), + .cluster_events_i ( other_events_i ), + .core_irq_req_o ( core_irq_req_o ), + .core_irq_id_o ( core_irq_id_o ), + .core_irq_ack_i ( core_irq_ack_i ), + .core_irq_ack_id_i ( core_irq_ack_id_i ), + .core_busy_i ( core_busy_i ), + .core_clock_en_o ( core_clock_en_o ), + .dbg_req_i ( dbg_req_i ), + .core_dbg_req_o ( core_dbg_req_o ), + .soc_periph_evt_valid_i ( 1'b0 ), + .soc_periph_evt_ready_o ( soc_periph_evt_ready_internal ), + .soc_periph_evt_data_i ( '0 ), + .speriph_slave ( speriph_slave.Slave ), + .eu_direct_link ( eu_direct_link ) + ); + +endmodule: magia_event_unit \ No newline at end of file diff --git a/hw/tile/magia_tile.sv b/hw/tile/magia_tile.sv index 822ada0..9ed4b0b 100644 --- a/hw/tile/magia_tile.sv +++ b/hw/tile/magia_tile.sv @@ -15,6 +15,7 @@ * SPDX-License-Identifier: SHL-0.51 * * Authors: Victor Isachi + * Luca Balboni * * MAGIA Tile */ @@ -27,18 +28,20 @@ module magia_tile import magia_pkg::*; import redmule_pkg::*; import hci_package::*; +`ifdef CV32E40X import cv32e40x_pkg::*; import fpu_ss_pkg::*; +`endif import snitch_icache_pkg::*; import 
idma_pkg::*; import obi_pkg::*; import axi_pkg::*; import floo_pkg::*; - `ifndef TARGET_STANDALONE_TILE +`ifndef TARGET_STANDALONE_TILE import magia_noc_pkg::*; - `else +`else import floo_axi_mesh_1x2_noc_pkg::*; - `endif +`endif #( // Parameters used by hci_interconnect and l1_spm parameter int unsigned N_MEM_BANKS = magia_pkg::N_MEM_BANKS, // Number of memory banks @@ -124,6 +127,14 @@ module magia_tile logic[magia_pkg::ADDR_W-1:0] tile_l1_end_addr; logic[magia_pkg::ADDR_W-1:0] tile_reserved_start_addr; logic[magia_pkg::ADDR_W-1:0] tile_reserved_end_addr; + logic[magia_pkg::ADDR_W-1:0] tile_redmule_ctrl_start_addr; + logic[magia_pkg::ADDR_W-1:0] tile_redmule_ctrl_end_addr; + logic[magia_pkg::ADDR_W-1:0] tile_idma_ctrl_start_addr; + logic[magia_pkg::ADDR_W-1:0] tile_idma_ctrl_end_addr; + logic[magia_pkg::ADDR_W-1:0] tile_fsync_ctrl_start_addr; + logic[magia_pkg::ADDR_W-1:0] tile_fsync_ctrl_end_addr; + logic[magia_pkg::ADDR_W-1:0] tile_event_unit_start_addr; + logic[magia_pkg::ADDR_W-1:0] tile_event_unit_end_addr; magia_tile_pkg::redmule_data_req_t redmule_data_req; magia_tile_pkg::redmule_data_rsp_t redmule_data_rsp; @@ -137,8 +148,8 @@ module magia_tile magia_tile_pkg::core_obi_data_req_t core_obi_data_req; magia_tile_pkg::core_obi_data_rsp_t core_obi_data_rsp; - magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_req; // Index 0 -> L2, Index 1 -> L1SPM - magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_rsp; // Index 0 -> L2, Index 1 -> L1SPM + magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_req; // cv32e40x: Index 0 -> L2, Index 1 -> L1SPM; cv32e40p: Index 2 -> RedMulE_ctrl, Index 3 -> iDMA_ctrl, Index 4 -> FSync_ctrl + magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_rsp; // cv32e40x: Index 0 -> L2, Index 1 -> L1SPM; cv32e40p: Index 2 -> RedMulE_ctrl, Index 3 -> iDMA_ctrl, Index 4 -> FSync_ctrl 
magia_tile_pkg::core_obi_data_req_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_req; // Index 0 -> L2, Index 1 -> L1SPM magia_tile_pkg::core_obi_data_rsp_t[magia_tile_pkg::N_SBR-1:0] core_mem_data_cut_rsp; // Index 0 -> L2, Index 1 -> L1SPM @@ -243,6 +254,13 @@ module magia_tile logic sys_clk; logic sys_clk_en; + // Core clock gating signals + logic core_clk; // Gated clock for the core + logic core_clk_en; // Core clock-gate enable, driven from eu_core_clk_en[0] + + // Core output signals + logic core_busy_o; + logic[magia_pkg::N_IRQ-1:0] irq; logic redmule_busy; logic[magia_tile_pkg::N_CORE-1:0][1:0] redmule_evt; @@ -266,6 +284,12 @@ module magia_tile logic fsync_done; logic fsync_error; + // Event arrays for Event Unit (need proper 2D array structure) + logic [0:0] [3:0] acc_events_array; + logic [0:0] [1:0] dma_events_array; + logic [0:0] [1:0] timer_events_array; + logic [0:0][31:0] other_events_array; + // FlooNoC connections between NI and router floo_req_t [4:0] floo_router_req_in; floo_rsp_t [4:0] floo_router_rsp_in; @@ -273,7 +297,7 @@ module magia_tile floo_rsp_t [4:0] floo_router_rsp_out; id_t floo_id; - + logic x_compressed_valid; logic x_compressed_ready; fpu_ss_pkg::x_compressed_req_t x_compressed_req; @@ -294,25 +318,52 @@ module magia_tile logic x_result_ready; fpu_ss_pkg::x_result_t x_result; + // Event Unit signals - Corrected for single-core array interface + logic [0:0] eu_core_irq_req; // [0:0] array for single core + logic [0:0][magia_tile_pkg::EVENT_UNIT_IRQ_WIDTH-1:0] eu_core_irq_id; // [0:0][4:0] array + logic [0:0] eu_core_irq_ack; // [0:0] array + logic [0:0][magia_tile_pkg::EVENT_UNIT_IRQ_WIDTH-1:0] eu_core_irq_ack_id; // [0:0][4:0] array + logic [0:0] eu_core_clk_en; // [0:0] array + logic [0:0] eu_core_dbg_req; // [0:0] array + + // Core data demux signals + magia_tile_pkg::core_data_req_t core_data_req_to_xbar; + magia_tile_pkg::core_data_rsp_t core_data_rsp_from_xbar; + magia_tile_pkg::eu_direct_req_t eu_direct_req; + magia_tile_pkg::eu_direct_rsp_t 
eu_direct_rsp; + /*******************************************************/ /** Internal Signal Definitions End **/ /*******************************************************/ /** Hardwired Signals Beginning **/ /*******************************************************/ - assign tile_l1_start_addr = magia_tile_pkg::L1_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; - assign tile_l1_end_addr = magia_tile_pkg::L1_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; - assign tile_reserved_start_addr = magia_tile_pkg::RESERVED_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; - assign tile_reserved_end_addr = magia_tile_pkg::RESERVED_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; - - assign obi_xbar_rule[magia_tile_pkg::L2_IDX] = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END }; - assign obi_xbar_rule[magia_tile_pkg::L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; - assign obi_xbar_rule[magia_tile_pkg::RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; - assign obi_xbar_rule[magia_tile_pkg::STACK_IDX] = '{idx: 32'd1, start_addr: magia_tile_pkg::STACK_ADDR_START, end_addr: magia_tile_pkg::STACK_ADDR_END }; - - assign axi_xbar_rule[magia_tile_pkg::L2_IDX] = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END }; - assign axi_xbar_rule[magia_tile_pkg::L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; - assign axi_xbar_rule[magia_tile_pkg::RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; + assign tile_redmule_ctrl_start_addr = magia_tile_pkg::REDMULE_CTRL_ADDR_START; + assign tile_redmule_ctrl_end_addr = magia_tile_pkg::REDMULE_CTRL_ADDR_END; + assign tile_idma_ctrl_start_addr = magia_tile_pkg::IDMA_CTRL_ADDR_START; + assign tile_idma_ctrl_end_addr = magia_tile_pkg::IDMA_CTRL_ADDR_END; 
+ assign tile_fsync_ctrl_start_addr = magia_tile_pkg::FSYNC_CTRL_ADDR_START; + assign tile_fsync_ctrl_end_addr = magia_tile_pkg::FSYNC_CTRL_ADDR_END; + assign tile_event_unit_start_addr = magia_tile_pkg::EVENT_UNIT_ADDR_START; + assign tile_event_unit_end_addr = magia_tile_pkg::EVENT_UNIT_ADDR_END; + assign tile_l1_start_addr = magia_tile_pkg::L1_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; + assign tile_l1_end_addr = magia_tile_pkg::L1_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; + assign tile_reserved_start_addr = magia_tile_pkg::RESERVED_ADDR_START + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; + assign tile_reserved_end_addr = magia_tile_pkg::RESERVED_ADDR_END + mhartid_i*magia_tile_pkg::L1_TILE_OFFSET; + + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L2_IDX] = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_STACK_IDX] = '{idx: 32'd1, start_addr: magia_tile_pkg::STACK_ADDR_START, end_addr: magia_tile_pkg::STACK_ADDR_END }; +`ifndef CV32E40X + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_REDMULE_CTRL_IDX] = '{idx: 32'd2, start_addr: tile_redmule_ctrl_start_addr, end_addr: tile_redmule_ctrl_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_IDMA_IDX] = '{idx: 32'd3, start_addr: tile_idma_ctrl_start_addr, end_addr: tile_idma_ctrl_end_addr }; + assign obi_xbar_rule[magia_tile_pkg::OBI_XBAR_FSYNC_CTRL_IDX] = '{idx: 32'd4, start_addr: tile_fsync_ctrl_start_addr, end_addr: tile_fsync_ctrl_end_addr }; +`endif + + assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_L2_IDX] = '{idx: 32'd0, start_addr: magia_tile_pkg::L2_ADDR_START, end_addr: magia_tile_pkg::L2_ADDR_END }; 
+ assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_L1SPM_IDX] = '{idx: 32'd1, start_addr: tile_l1_start_addr, end_addr: tile_l1_end_addr }; + assign axi_xbar_rule[magia_tile_pkg::AXI_XBAR_RESERVED_IDX] = '{idx: 32'd1, start_addr: tile_reserved_start_addr, end_addr: tile_reserved_end_addr }; assign obi_xbar_en_default_idx = '1; // Routing to the AXI Xbar all requests with an address outside the range of the internal L1 and the external L2 assign obi_xbar_default_idx = '0; @@ -350,7 +401,9 @@ module magia_tile assign hci_clear = 1'b0; assign hci_ctrl = '0; +`ifdef CV32E40X assign redmule_ctrl_req = '0; +`endif assign idma_clear = 1'b0; @@ -372,23 +425,24 @@ module magia_tile {{magia_tile_pkg::SET_OPCODE, magia_tile_pkg::SET_S_FUNC3}} }}; assign xif_coproc_rules[magia_tile_pkg::XIF_FSYNC_IDX] = '{sign_list: '{ default: {magia_tile_pkg::FSYNC_OPCODE, magia_tile_pkg::FSYNC_FUNC3} }}; - assign irq[magia_tile_pkg::IRQ_IDX_REDMULE_EVT_0] = redmule_evt[0][0]; // Only 1 core supported - assign irq[magia_tile_pkg::IRQ_IDX_REDMULE_EVT_1] = redmule_evt[0][1]; // Only 1 core supported - assign irq[magia_tile_pkg::IRQ_IDX_A2O_ERROR] = idma_axi2obi_error; - assign irq[magia_tile_pkg::IRQ_IDX_O2A_ERROR] = idma_obi2axi_error; - assign irq[magia_tile_pkg::IRQ_IDX_A2O_DONE] = idma_axi2obi_done; - assign irq[magia_tile_pkg::IRQ_IDX_O2A_DONE] = idma_obi2axi_done; - assign irq[magia_tile_pkg::IRQ_IDX_A2O_START] = idma_axi2obi_start; - assign irq[magia_tile_pkg::IRQ_IDX_O2A_START] = idma_obi2axi_start; - assign irq[magia_tile_pkg::IRQ_IDX_A2O_BUSY] = idma_axi2obi_busy; - assign irq[magia_tile_pkg::IRQ_IDX_O2A_BUSY] = idma_obi2axi_busy; - assign irq[magia_tile_pkg::IRQ_IDX_REDMULE_BUSY] = redmule_busy; - assign irq[magia_tile_pkg::IRQ_IDX_FSYNC_DONE] = fsync_done; - assign irq[magia_tile_pkg::IRQ_IDX_FSYNC_ERROR] = fsync_error; +`ifdef CV32E40X + assign irq[magia_tile_pkg::IRQ_IDX_REDMULE_EVT_0] = 1'b0; /* redmule_evt[0][0]; */ // Event Unit manages these interrupts // Only 1 core supported + 
assign irq[magia_tile_pkg::IRQ_IDX_REDMULE_EVT_1] = 1'b0; /* redmule_evt[0][1]; */ // Event Unit manages these interrupts // Only 1 core supported + assign irq[magia_tile_pkg::IRQ_IDX_A2O_ERROR] = 1'b0; /* idma_axi2obi_error; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_O2A_ERROR] = 1'b0; /* idma_obi2axi_error; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_A2O_DONE] = 1'b0; /* idma_axi2obi_done; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_O2A_DONE] = 1'b0; /* idma_obi2axi_done; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_A2O_START] = 1'b0; /* idma_axi2obi_start; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_O2A_START] = 1'b0; /* idma_obi2axi_start; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_A2O_BUSY] = 1'b0; /* idma_axi2obi_busy; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_O2A_BUSY] = 1'b0; /* idma_obi2axi_busy; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_REDMULE_BUSY] = 1'b0; /* redmule_busy; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_FSYNC_DONE] = 1'b0; /* fsync_done; */ // Event Unit manages these interrupts + assign irq[magia_tile_pkg::IRQ_IDX_FSYNC_ERROR] = 1'b0; /* fsync_error; */ // Event Unit manages these interrupts assign irq[magia_pkg::N_IRQ-magia_tile_pkg::IRQ_USED-1:16] = irq_i[magia_pkg::N_IRQ-magia_tile_pkg::IRQ_USED-1:16]; assign irq[15:12] = '0; - assign irq[11] = irq_i[11]; + assign irq[11] = eu_core_irq_req[0]; // Event Unit IRQ mapped to external interrupt (bit 11) /* irq_i[11]; */ assign irq[10:8] = '0; assign irq[7] = irq_i[7]; assign irq[6:4] = '0; @@ -411,6 +465,21 @@ module magia_tile assign xif_redmule_if.compressed_req = '0; assign xif_redmule_if.mem_ready = 1'b0; assign xif_redmule_if.mem_resp = '0; +`else + // Icache control 
signals + assign enable_prefetching = 1'b0; + assign flush_valid = '0; + + // Event Unit provides unified interrupt management + // External interrupts must be mapped to bit 11 (MEIE - Machine External Interrupt Enable) + assign irq[magia_pkg::N_IRQ-1:12] = '0; // Clear all high IRQs + assign irq[11] = eu_core_irq_req[0]; // Event Unit IRQ mapped to external interrupt (bit 11) + assign irq[10:8] = '0; // Clear IRQs 8-10 + assign irq[7] = 1'b0; // Timer interrupt (unused) + assign irq[6:4] = '0; // Clear IRQs 4-6 + assign irq[3] = 1'b0; // Software interrupt (unused) + assign irq[2:0] = '0; // Clear IRQs 0-2 +`endif assign floo_id = '{x: (x_id_i+1), y: y_id_i, port_id: 0}; @@ -420,14 +489,15 @@ module magia_tile /** Type Conversions Beginning **/ /*******************************************************/ + // Convert core data interface to OBI for crossbar data2obi_req i_core_data2obi_req ( - .data_req_i ( core_data_req ), - .obi_req_o ( core_obi_data_req ) + .data_req_i ( core_data_req_to_xbar ), + .obi_req_o ( core_obi_data_req ) ); obi2data_rsp i_core_obi2data_rsp ( - .obi_rsp_i ( core_obi_data_rsp ), - .data_rsp_o ( core_data_rsp ) + .obi_rsp_i ( core_obi_data_rsp ), + .data_rsp_o ( core_data_rsp_from_xbar ) ); obi2hci_req #( @@ -459,17 +529,17 @@ module magia_tile .axi_rsp_t ( magia_tile_pkg::core_axi_data_rsp_t ), .MaxRequests ( 1 ) ) i_core_data_obi2axi ( - .clk_i ( sys_clk ), - .rst_ni ( rst_ni ), - .obi_req_i ( core_mem_data_req[magia_tile_pkg::L2_IDX] ), - .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::L2_IDX] ), - .user_i ( axi_data_user ), - .axi_req_o ( core_l2_data_req ), - .axi_rsp_i ( core_l2_data_rsp ), - .axi_rsp_channel_sel ( ), - .axi_rsp_b_user_o ( ), - .axi_rsp_r_user_o ( ), - .obi_rsp_user_i ( obi_rsp_data_user ) + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_L2_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_L2_IDX] ), + .user_i ( axi_data_user ), + .axi_req_o ( 
core_l2_data_req ), + .axi_rsp_i ( core_l2_data_rsp ), + .axi_rsp_channel_sel ( ), + .axi_rsp_b_user_o ( ), + .axi_rsp_r_user_o ( ), + .obi_rsp_user_i ( obi_rsp_data_user ) ); instr2cache_req i_core_instr2cache_req ( @@ -560,10 +630,44 @@ module magia_tile .rsp_r_user_i ( axi2obi_rsp_r_user ) ); +`ifndef CV32E40X + // RedMule controller OBI-to-HWPE control interface + obi2hwpe_ctrl obi2hwpe_ctrl_inst ( + .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_REDMULE_CTRL_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_REDMULE_CTRL_IDX] ), + .ctrl_req_o ( redmule_ctrl_req ), + .ctrl_rsp_i ( redmule_ctrl_rsp ) + ); +`endif + /*******************************************************/ /** Type Conversions End **/ /*******************************************************/ -/** Clock gating Beginning **/ +/** Core Data Demux Beginning **/ +/*******************************************************/ + + // Core data demux: splits requests between regular crossbar and EU direct link + core_data_demux_eu_direct i_core_data_demux_eu_direct ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + + // Core interface + .core_data_req_i ( core_data_req ), + .core_data_rsp_o ( core_data_rsp ), + + // Regular crossbar interface + .xbar_data_req_o ( core_data_req_to_xbar ), + .xbar_data_rsp_i ( core_data_rsp_from_xbar ), + + // EU direct link interface + .eu_direct_req_o ( eu_direct_req ), + .eu_direct_rsp_i ( eu_direct_rsp ) + ); + +/*******************************************************/ +/** Core Data Demux End **/ +/*******************************************************/ +/** Clock Gating Beginning **/ /*******************************************************/ always_ff @(posedge clk_i, negedge rst_ni) begin: sys_clk_en_ff @@ -578,8 +682,18 @@ module magia_tile .clk_o ( sys_clk ) ); + // Core clock gating controlled by Event Unit + assign core_clk_en = eu_core_clk_en[0]; // Event Unit controls core clock + + tc_clk_gating core_clock_gating ( + .clk_i ( sys_clk ), + 
.en_i ( core_clk_en ), + .test_en_i ( test_mode_i ), + .clk_o ( core_clk ) + ); + /*******************************************************/ -/** Clock gating End **/ +/** Clock Gating End **/ /*******************************************************/ /** Interface Definitions Beginning **/ /*******************************************************/ @@ -687,7 +801,11 @@ module magia_tile .N_CORES ( magia_tile_pkg::N_CORE ), .DW ( magia_tile_pkg::REDMULE_DW ), .UW ( magia_tile_pkg::REDMULE_UW ), +`ifdef CV32E40X .X_EXT ( magia_tile_pkg::X_EXT_EN ), +`else + .X_EXT ( 1'b0 ), // RedMulE does not implement the eXtension Interface (X) - using HWPE-CTRL mode +`endif .SysInstWidth ( magia_pkg::INSTR_W ), .SysDataWidth ( magia_pkg::DATA_W ), .redmule_data_req_t ( magia_tile_pkg::redmule_data_req_t ), @@ -702,10 +820,12 @@ module magia_tile .busy_o ( redmule_busy ), .evt_o ( redmule_evt ), +`ifdef CV32E40X .xif_issue_if_i ( xif_coproc_if.coproc_issue[magia_tile_pkg::XIF_REDMULE_IDX] ), .xif_result_if_o ( xif_redmule_if.coproc_result ), .xif_compressed_if_i ( xif_redmule_if.coproc_compressed ), .xif_mem_if_o ( xif_redmule_if.coproc_mem ), +`endif .data_req_o ( redmule_data_req ), .data_rsp_i ( redmule_data_rsp ), @@ -720,6 +840,7 @@ module magia_tile /** Core Beginning **/ /*******************************************************/ +`ifdef CV32E40X // Documentation of cv32e40x_core's design parameters and interface is available at: // https://docs.openhwgroup.org/projects/cv32e40x-user-manual/en/latest/integration.html#core-integration @@ -828,6 +949,114 @@ module magia_tile .core_sleep_o , .wu_wfe_i ); +`else + // flex-v core with integrated FPU and tracer + riscv_core #( + .N_EXT_PERF_COUNTERS ( magia_tile_pkg::N_EXT_PERF_COUNTERS ), + .INSTR_RDATA_WIDTH ( magia_tile_pkg::INSTR_RDATA_WIDTH ), + .PULP_SECURE ( magia_tile_pkg::PULP_SECURE ), + .N_PMP_ENTRIES ( magia_tile_pkg::N_PMP_ENTRIES ), + .USE_PMP ( magia_tile_pkg::USE_PMP ), + .PULP_CLUSTER ( magia_tile_pkg::PULP_CLUSTER ), 
+ .FPU ( magia_tile_pkg::FPU ), + .Zfinx ( magia_tile_pkg::ZFINX ), + .FP_DIVSQRT ( magia_tile_pkg::FP_DIVSQRT ), + .SHARED_FP ( magia_tile_pkg::SHARED_FP ), + .SHARED_DSP_MULT ( magia_tile_pkg::SHARED_DSP_MULT ), + .SHARED_INT_MULT ( magia_tile_pkg::SHARED_INT_MULT ), + .SHARED_INT_DIV ( magia_tile_pkg::SHARED_INT_DIV ), + .SHARED_FP_DIVSQRT ( magia_tile_pkg::SHARED_FP_DIVSQRT ), + .WAPUTYPE ( magia_tile_pkg::WAPUTYPE ), + .APU_NARGS_CPU ( magia_tile_pkg::APU_NARGS_CPU ), + .APU_WOP_CPU ( magia_tile_pkg::APU_WOP_CPU ), + .APU_NDSFLAGS_CPU ( magia_tile_pkg::APU_NDSFLAGS_CPU ), + .APU_NUSFLAGS_CPU ( magia_tile_pkg::APU_NUSFLAGS_CPU ), + .DM_HaltAddress ( magia_tile_pkg::DM_HALT_ADDR ) + ) i_cv32e40p_core ( + // Clock and Reset + .clk_i ( core_clk ), // Use gated clock for core + .rst_ni ( rst_ni ), + + // Clock enable and test mode + .clock_en_i ( sys_clk_en ), + .test_en_i ( test_mode_i ), + + // Floating-point register file disable (for Zfinx) + .fregfile_disable_i ( 1'b0 ), // FPU enabled, use dedicated FP regfile + + // Boot configuration + .boot_addr_i ( boot_addr_i ), + + // Cluster/Core IDs + .cluster_id_i ( mhartid_i[9:4] ), + .core_id_i ( mhartid_i[3:0] ), + + // Instruction memory interface + .instr_req_o ( core_instr_req.req ), + .instr_gnt_i ( core_instr_rsp.gnt ), + .instr_rvalid_i ( core_instr_rsp.rvalid ), + .instr_addr_o ( core_instr_req.addr ), + .instr_rdata_i ( core_instr_rsp.rdata ), + + // Data memory interface + .data_req_o ( core_data_req.req ), + .data_gnt_i ( core_data_rsp.gnt ), + .data_rvalid_i ( core_data_rsp.rvalid ), + .data_addr_o ( core_data_req.addr ), + .data_be_o ( core_data_req.be ), + .data_wdata_o ( core_data_req.wdata ), + .data_we_o ( core_data_req.we ), + .data_rdata_i ( core_data_rsp.rdata ), + + // APU interface (disabled - not connected) + .apu_master_req_o ( ), + .apu_master_ready_o ( ), + .apu_master_gnt_i ( '0 ), + + .apu_master_operands_o ( ), + .apu_master_op_o ( ), + .apu_master_type_o ( ), + .apu_master_flags_o ( ), 
+ + .apu_master_valid_i ( '0 ), + .apu_master_result_i ( '0 ), + .apu_master_flags_i ( '0 ), + + // Interrupts + .irq_i ( eu_core_irq_req[0] ), + .irq_id_i ( '0 ), // NOTE(review): ID input is tied to 0 while the event unit drives eu_core_irq_id - confirm this is intended + .irq_ack_o ( eu_core_irq_ack[0] ), + .irq_id_o ( eu_core_irq_ack_id[0] ), + .irq_sec_i ( '0 ), + + // Security level (unused) + .sec_lvl_o ( ), + + // Debug interface + .debug_req_i ( debug_req_i ), + + // CPU control + .fetch_enable_i ( fetch_enable_i ), + .core_busy_o ( core_busy_o ), + + + // Performance counters + .ext_perf_counters_i ( '0 ) + ); + + assign core_sleep_o = !core_busy_o; + + assign core_instr_req.memtype = 2'b00; + assign core_instr_req.prot = 3'b000; + assign core_instr_req.dbg = 1'b0; + + assign mcycle_o = 64'h0; + assign debug_havereset_o = 1'b0; + assign debug_running_o = 1'b0; + assign debug_halted_o = 1'b0; + assign debug_pc_valid_o = 1'b0; + assign debug_pc_o = 32'h0; +`endif /*******************************************************/ /** Core End **/ @@ -847,16 +1076,19 @@ module magia_tile .LrScEnable ( ), .RegisterAmo ( magia_tile_pkg::RegisterAmo ) ) i_obi_atomics ( - .clk_i ( sys_clk ), - .rst_ni ( rst_ni ), - .testmode_i ( test_mode_i ), - .sbr_port_req_i ( core_mem_data_req[magia_tile_pkg::L1SPM_IDX] ), - .sbr_port_rsp_o ( core_mem_data_rsp[magia_tile_pkg::L1SPM_IDX] ), - .mgr_port_req_o ( core_l1_data_amo_req ), - .mgr_port_rsp_i ( core_l1_data_amo_rsp ) + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .testmode_i ( test_mode_i ), + .sbr_port_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ), + .sbr_port_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_L1SPM_IDX] ), + .mgr_port_req_o ( core_l1_data_amo_req ), + .mgr_port_rsp_i ( core_l1_data_amo_rsp ) ); - for (genvar i = 0; i < magia_tile_pkg::N_MGR; i++) begin: gen_obi_xbar_sbr_cut + // Cut only external paths coming from the AXI XBAR + assign obi_xbar_slv_cut_req[0] = obi_xbar_slv_req[0]; + assign obi_xbar_slv_rsp[0] = obi_xbar_slv_cut_rsp[0]; + for (genvar i = 1; i < magia_tile_pkg::N_MGR; i++) begin: 
gen_obi_xbar_sbr_cut obi_cut #( .ObiCfg ( magia_tile_pkg::obi_amo_cfg ), .obi_a_chan_t ( magia_tile_pkg::core_data_obi_a_chan_t ), @@ -871,7 +1103,7 @@ module magia_tile .mgr_port_req_o ( obi_xbar_slv_cut_req[i] ), .mgr_port_rsp_i ( obi_xbar_slv_cut_rsp[i] ) ); - end + end obi_xbar #( .SbrPortObiCfg ( magia_tile_pkg::obi_amo_cfg ), @@ -1009,6 +1241,7 @@ module magia_tile /** iDMA Beginning **/ /*******************************************************/ +`ifdef CV32E40X idma_ctrl #( .ERROR_CAP ( ERROR_CAP ), .axi_req_t ( magia_tile_pkg::idma_axi_req_t ), @@ -1045,6 +1278,50 @@ module magia_tile .obi2axi_done_o ( idma_obi2axi_done ), .obi2axi_error_o ( idma_obi2axi_error ) ); +`else + idma_ctrl_mm #( + .ERROR_CAP ( ERROR_CAP ), + .obi_req_t ( magia_tile_pkg::core_obi_data_req_t ), + .obi_rsp_t ( magia_tile_pkg::core_obi_data_rsp_t ), + .idma_fe_reg_req_t ( magia_tile_pkg::idma_fe_reg_req_t ), + .idma_fe_reg_rsp_t ( magia_tile_pkg::idma_fe_reg_rsp_t ), + .axi_req_t ( magia_tile_pkg::idma_axi_req_t ), + .axi_rsp_t ( magia_tile_pkg::idma_axi_rsp_t ), + .idma_obi_req_t ( magia_tile_pkg::idma_obi_req_t ), + .idma_obi_rsp_t ( magia_tile_pkg::idma_obi_rsp_t ) + ) i_idma_ctrl_mm ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .test_en_i ( test_mode_i ), + .clear_i ( idma_clear ), + + // OBI Slave Interface (CPU memory-mapped access) + .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_IDMA_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_IDMA_IDX] ), + + // AXI Master Interfaces (to L2 memory) + .axi_read_req_o ( idma_axi_read_req ), + .axi_read_rsp_i ( idma_axi_read_rsp ), + .axi_write_req_o ( idma_axi_write_req ), + .axi_write_rsp_i ( idma_axi_write_rsp ), + + // OBI Master Interfaces (to L1 memory) + .obi_read_req_o ( idma_obi_read_req ), + .obi_read_rsp_i ( idma_obi_read_rsp ), + .obi_write_req_o ( idma_obi_write_req ), + .obi_write_rsp_i ( idma_obi_write_rsp ), + + // Serialized IRQ outputs + .irq_a2o_busy_o ( idma_axi2obi_busy ), + .irq_a2o_start_o 
( idma_axi2obi_start ), + .irq_a2o_done_o ( idma_axi2obi_done ), + .irq_a2o_error_o ( idma_axi2obi_error ), + .irq_o2a_busy_o ( idma_obi2axi_busy ), + .irq_o2a_start_o ( idma_obi2axi_start ), + .irq_o2a_done_o ( idma_obi2axi_done ), + .irq_o2a_error_o ( idma_obi2axi_error ) + ); +`endif axi_rw_join #( .axi_req_t ( magia_tile_pkg::idma_axi_req_t ), @@ -1146,7 +1423,7 @@ module magia_tile /*******************************************************/ /** Data Out - L2 (AXI XBAR) End **/ /*******************************************************/ -/** FlooNoC modules Beginning **/ +/** FlooNoC Modules Beginning **/ /*******************************************************/ floo_axi_chimney #( @@ -1232,11 +1509,12 @@ module magia_tile assign noc_west_rsp_o = floo_router_rsp_out[3]; /*******************************************************/ -/** FlooNoC modules End **/ +/** FlooNoC Modules End **/ /*******************************************************/ /** Fractal Sync Out Beginning **/ /*******************************************************/ +`ifdef CV32E40X fractal_sync_xif_inst_decoder #( .INSTR_W ( magia_tile_pkg::FSYNC_INSTR_W ), .DATA_W ( magia_tile_pkg::FSYNC_DATA_W ), @@ -1264,6 +1542,28 @@ module magia_tile .done_o ( fsync_done ), .error_o ( fsync_error ) ); +`else + // Fractal Sync OBI Memory-Mapped Slave (replaces XIF interface) + obi_slave_fsync #( + .BASE_ADDR ( magia_tile_pkg::FSYNC_CTRL_ADDR_START ), + .AGGR_W ( magia_tile_pkg::FSYNC_AGGR_W ), + .ID_W ( magia_tile_pkg::FSYNC_ID_W ), + .NBR_AGGR_W ( magia_tile_pkg::FSYNC_NBR_AGGR_W ), + .NBR_ID_W ( magia_tile_pkg::FSYNC_NBR_ID_W ) + ) i_fsync_mm ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .clear_i ( fsync_clear ), + .obi_req_i ( core_mem_data_req[magia_tile_pkg::OBI_XBAR_FSYNC_CTRL_IDX] ), + .obi_rsp_o ( core_mem_data_rsp[magia_tile_pkg::OBI_XBAR_FSYNC_CTRL_IDX] ), + .ht_fsync_if_o ( ht_fsync_if_o ), + .hn_fsync_if_o ( hn_fsync_if_o ), + .vt_fsync_if_o ( vt_fsync_if_o ), + .vn_fsync_if_o ( vn_fsync_if_o ), + 
.done_o ( fsync_done ), + .error_o ( fsync_error ) + ); +`endif /*******************************************************/ /** Fractal Sync Out End **/ @@ -1271,6 +1571,7 @@ module magia_tile /** Floating-Point Unit Beginning **/ /*******************************************************/ +`ifdef CV32E40X fpu_ss #( .PULP_ZFINX ( magia_tile_pkg::FPU_ZFINX ), .INPUT_BUFFER_DEPTH ( magia_tile_pkg::FPU_BUFFER_DEPTH ), @@ -1331,9 +1632,75 @@ module magia_tile .x_result_ready_o ( x_result_ready ), .x_result_i ( x_result ) ); +`endif /*******************************************************/ /** Floating-Point Unit End **/ /*******************************************************/ +/** Event Unit Beginning **/ +/*******************************************************/ + + // Event array assignments for proper 2D array structure + assign acc_events_array[0] = {redmule_evt[0][1], redmule_evt[0][0], redmule_busy, 1'b0}; + assign dma_events_array[0] = {idma_obi2axi_done, idma_axi2obi_done}; + assign timer_events_array[0] = 2'b00; + assign other_events_array[0] = {idma_obi2axi_busy, idma_axi2obi_busy, idma_obi2axi_start, idma_axi2obi_start, idma_obi2axi_error, idma_axi2obi_error, fsync_error, fsync_done, 24'b0}; // iDMA status events [31:28]|idma_obi2axi_error, idma_axi2obi_error, iDMA error events [27:26]|fsync_error, fsync_done, Fsync events [25:24], Reserved [23:0] - SW events are INTERNAL to Event Unit! 
+ +`ifdef CV32E40X + assign eu_core_irq_ack = eu_core_irq_req; + assign eu_core_irq_ack_id = eu_core_irq_id; + + assign core_busy_o = !core_sleep_o; +`endif + + magia_event_unit #( + .NB_CORES ( 1 ), // Single core system + .NB_SW_EVT ( 1 ), // Minimum 1 SW event to avoid indexing issues (unused but required) + .NB_BARR ( 0 ), // No barriers needed with single core + .NB_HW_MUT ( 0 ), // No mutexes needed with single core + .MUTEX_MSG_W ( 32 ), // Keep default even if unused + .DISP_FIFO_DEPTH ( 0 ), // No task dispatcher needed + .EVNT_WIDTH ( 8 ), // SOC event width (keep default) + .SOC_FIFO_DEPTH ( 8 ) // SOC FIFO depth (keep default) + ) i_magia_event_unit ( + .clk_i ( sys_clk ), + .rst_ni ( rst_ni ), + .test_mode_i ( test_mode_i ), + + // Event inputs - single core arrays + .acc_events_i ( acc_events_array ), // Accelerator events + .dma_events_i ( dma_events_array ), // iDMA completion events + .timer_events_i ( timer_events_array ), + .other_events_i ( other_events_array ), // Combined events + + // Core IRQ interface + .core_irq_req_o ( eu_core_irq_req ), + .core_irq_id_o ( eu_core_irq_id ), + .core_irq_ack_i ( eu_core_irq_ack ), + .core_irq_ack_id_i ( eu_core_irq_ack_id ), + + // Core control + .core_busy_i ( core_busy_o ), + .core_clock_en_o ( eu_core_clk_en ), + + // Debug + .dbg_req_i ( debug_req_i ), + .core_dbg_req_o ( eu_core_dbg_req ), + + // EU Direct Link Interface - abstract types + .eu_direct_req_i ( eu_direct_req.req ), + .eu_direct_addr_i ( eu_direct_req.addr ), + .eu_direct_wen_i ( eu_direct_req.wen ), + .eu_direct_wdata_i ( eu_direct_req.wdata ), + .eu_direct_be_i ( eu_direct_req.be ), + .eu_direct_gnt_o ( eu_direct_rsp.gnt ), + .eu_direct_rvalid_o ( eu_direct_rsp.rvalid ), + .eu_direct_rdata_o ( eu_direct_rsp.rdata ), + .eu_direct_err_o ( eu_direct_rsp.err ) + ); + +/*******************************************************/ +/** Event Unit End **/ +/*******************************************************/ endmodule: magia_tile \ No newline 
at end of file diff --git a/hw/tile/magia_tile_pkg.sv b/hw/tile/magia_tile_pkg.sv index bc676bd..d3955b3 100644 --- a/hw/tile/magia_tile_pkg.sv +++ b/hw/tile/magia_tile_pkg.sv @@ -15,6 +15,7 @@ * SPDX-License-Identifier: SHL-0.51 * * Authors: Victor Isachi + * Luca Balboni * * MAGIA Tile Package */ @@ -50,20 +51,32 @@ package magia_tile_pkg; localparam int unsigned IRQ_USED = 13; // Address map - localparam logic[magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START = 32'h0000_0000; - localparam logic[magia_pkg::ADDR_W-1:0] RESERVED_SIZE = 32'h0001_0000; - localparam logic[magia_pkg::ADDR_W-1:0] RESERVED_ADDR_END = RESERVED_ADDR_START + RESERVED_SIZE; - localparam logic[magia_pkg::ADDR_W-1:0] STACK_ADDR_START = RESERVED_ADDR_END; - localparam logic[magia_pkg::ADDR_W-1:0] STACK_SIZE = 32'h0001_0000; - localparam logic[magia_pkg::ADDR_W-1:0] STACK_ADDR_END = STACK_ADDR_START + STACK_SIZE; - localparam logic[magia_pkg::ADDR_W-1:0] L1_ADDR_START = STACK_ADDR_END; - localparam logic[magia_pkg::ADDR_W-1:0] L1_SIZE = 32'h000E_0000; - localparam logic[magia_pkg::ADDR_W-1:0] L1_ADDR_END = L1_ADDR_START + L1_SIZE; - localparam logic[magia_pkg::ADDR_W-1:0] L1_TILE_OFFSET = 32'h0010_0000; - localparam logic[magia_pkg::ADDR_W-1:0] L2_ADDR_START = 32'hC000_0000; - localparam logic[magia_pkg::ADDR_W-1:0] L2_SIZE = 32'h4000_0000; - localparam logic[magia_pkg::ADDR_W-1:0] L2_ADDR_END = L2_ADDR_START + L2_SIZE; - + localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_START = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_SIZE = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] REDMULE_CTRL_ADDR_END = REDMULE_CTRL_ADDR_START + REDMULE_CTRL_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_START = REDMULE_CTRL_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_SIZE = 32'h0000_0400; + localparam logic [magia_pkg::ADDR_W-1:0] IDMA_CTRL_ADDR_END = IDMA_CTRL_ADDR_START + IDMA_CTRL_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] 
FSYNC_CTRL_ADDR_START = IDMA_CTRL_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_SIZE = 32'h0000_0100; + localparam logic [magia_pkg::ADDR_W-1:0] FSYNC_CTRL_ADDR_END = FSYNC_CTRL_ADDR_START + FSYNC_CTRL_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_START = FSYNC_CTRL_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_SIZE = 32'h0000_1000; + localparam logic [magia_pkg::ADDR_W-1:0] EVENT_UNIT_ADDR_END = EVENT_UNIT_ADDR_START + EVENT_UNIT_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_START = EVENT_UNIT_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_SIZE = 32'h0000_E900; // Calculated to make RESERVED_ADDR_END = 0x0001_0000 + localparam logic [magia_pkg::ADDR_W-1:0] RESERVED_ADDR_END = RESERVED_ADDR_START + RESERVED_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_START = RESERVED_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] STACK_SIZE = 32'h0001_0000; + localparam logic [magia_pkg::ADDR_W-1:0] STACK_ADDR_END = STACK_ADDR_START + STACK_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_START = STACK_ADDR_END; + localparam logic [magia_pkg::ADDR_W-1:0] L1_SIZE = 32'h000E_0000; + localparam logic [magia_pkg::ADDR_W-1:0] L1_ADDR_END = L1_ADDR_START + L1_SIZE; + localparam logic [magia_pkg::ADDR_W-1:0] L1_TILE_OFFSET = 32'h0010_0000; + localparam logic [magia_pkg::ADDR_W-1:0] L2_ADDR_START = 32'hC000_0000; + localparam logic [magia_pkg::ADDR_W-1:0] L2_SIZE = 32'h4000_0000; + localparam logic [magia_pkg::ADDR_W-1:0] L2_ADDR_END = L2_ADDR_START + L2_SIZE; + // Parameters used by the HCI parameter int unsigned N_HWPE = 1; // Number of HWPEs attached to the port parameter int unsigned N_CORE = 1; // Number of Core ports @@ -94,9 +107,8 @@ package magia_tile_pkg; localparam int unsigned SWH = DWH/BWH; // Strobe Width for HWPE Interconnect localparam int unsigned WDH = DWH/WWH; // Number of words per data for HWPE Interconnect - // Parameters used by the core - parameter 
bit X_EXT_EN = 1; // Enable eXtension Interface (X) support, see eXtension Interface - parameter int unsigned X_NUM_RS = 3; // Number of register file read ports that can be used by the eXtension interface + // Parameters used by the cv32e40x core + parameter bit X_EXT_EN = 1; // Enable eXtension Interface (X) support, see eXtension Interface parameter int unsigned X_ID_W = 4; // Identification width for the eXtension interface parameter int unsigned X_MEM_W = 32; // Memory access width for loads/stores via the eXtension interface parameter int unsigned X_RFR_W = 32; // Register file read access width for the eXtension interface @@ -106,7 +118,39 @@ package magia_tile_pkg; parameter bit[31:0] DM_REGION_START = 32'hF0000000; // Start address of Debug Module region, see Debug & Trigger parameter bit[31:0] DM_REGION_END = 32'hF0003FFF; // End address of Debug Module region, see Debug & Trigger parameter bit CLIC_EN = 1'b0; // Specifies whether Smclic, Smclicshv and Smclicconfig are supported + + // Parameters used by cv32e40p core + parameter int unsigned N_EXT_PERF_COUNTERS = 0; // Number of external performance counters + parameter int unsigned INSTR_RDATA_WIDTH = 32; // Instruction data width + parameter bit PULP_SECURE = 1'b0; // PULP security features + parameter int unsigned N_PMP_ENTRIES = 16; // Number of PMP entries + parameter bit USE_PMP = 1'b1; // Enable PMP + parameter bit PULP_CLUSTER = 1'b1; // PULP cluster mode + parameter bit FPU = 1'b1; // Enable FPU (main feature) + parameter bit ZFINX = 1'b0; // Zfinx extension (integer FP in GPR) - Must be 0 for standard FPU + parameter bit FP_DIVSQRT = 1'b1; // FP division and square root + parameter bit SHARED_FP = 1'b0; // Shared FP unit + parameter bit SHARED_DSP_MULT = 1'b0; // Shared DSP multiplier + parameter bit SHARED_INT_MULT = 1'b0; // Shared integer multiplier + parameter bit SHARED_INT_DIV = 1'b0; // Shared integer divider + parameter bit SHARED_FP_DIVSQRT = 1'b0; // Shared FP div/sqrt + parameter int 
unsigned WAPUTYPE = 0; // APU type width + parameter int unsigned APU_NARGS_CPU = 3; // APU number of arguments + parameter int unsigned APU_WOP_CPU = 6; // APU operation width + parameter int unsigned APU_NDSFLAGS_CPU = 15; // APU data side flags + parameter int unsigned APU_NUSFLAGS_CPU = 5; // APU user side flags + parameter logic[31:0] DM_HALT_ADDR = 32'h1A110800; // Debug module halt address + +`ifdef CV32E40X + parameter int unsigned X_NUM_RS = 3; // Number of register file read ports that can be used by the eXtension interface parameter int unsigned CLIC_ID_W = 1; // Width of clic_irq_id_i and clic_irq_id_o. The maximum number of supported interrupts in CLIC mode is 2^CLIC_ID_WIDTH. Trap vector table alignment is restricted as described in Machine Trap Vector Table Base Address (mtvt) +`else + parameter int unsigned X_NUM_RS = 2; // Number of register file read ports (R-type instructions have 2 source operands) + parameter int unsigned CLIC_ID_W = 5; // CLIC interrupt ID width (5 bits for 32 interrupts) +`endif + + // Parameters used by Event Unit + parameter int unsigned EVENT_UNIT_IRQ_WIDTH = 5; // Width of Event Unit IRQ ID signals (supports up to 32 different event types) // Parameters used by RedMulE parameter int unsigned REDMULE_DW = DWH; // RedMulE Data Width @@ -124,10 +168,18 @@ package magia_tile_pkg; parameter int unsigned RID_WIDTH = 1; // Width of the rid signal (response channel identifier, see OBI documentation) parameter int unsigned MID_WIDTH = 1; // Width of the mid signal (manager identifier, see OBI documentation) parameter int unsigned OBI_ID_WIDTH = 1; // Width of the id - configuration +`ifdef CV32E40X parameter int unsigned N_SBR = 2; // Number of slaves (HCI, AXI XBAR) +`else + parameter int unsigned N_SBR = 5; // Number of slaves (HCI, AXI XBAR, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl) - Event_Unit now via eu_direct_link +`endif parameter int unsigned N_MGR = 2; // Number of masters (Core, AXI XBAR) parameter int unsigned N_MAX_TRAN = 
1; // Number of maximum outstanding transactions +`ifdef CV32E40X parameter int unsigned N_ADDR_RULE = 4; // Number of address rules (L2, L1, Stack, Reserved) +`else + parameter int unsigned N_ADDR_RULE = 7; // Number of address rules (L2, L1, Stack, Reserved, RedMulE_Ctrl, iDMA_Ctrl, FSync_Ctrl) - Event_Unit now via eu_direct_link +`endif localparam int unsigned N_BIT_SBR = $clog2(N_SBR); // Number of bits required to identify each slave // Parameters used by AXI @@ -307,7 +359,7 @@ package magia_tile_pkg; }, PipeConfig: fpnew_pkg::DISTRIBUTED }; // FPU implementation - + typedef struct packed { int unsigned idx; logic[magia_pkg::ADDR_W-1:0] start_addr; @@ -334,6 +386,7 @@ package magia_tile_pkg; logic err; } core_instr_rsp_t; +`ifdef CV32E40X typedef struct packed { logic req; logic[magia_pkg::ADDR_W-1:0] addr; @@ -353,6 +406,38 @@ package magia_tile_pkg; logic err; logic exokay; } core_data_rsp_t; +`else + typedef struct packed { + logic req; + logic[magia_pkg::ADDR_W-1:0] addr; + logic[3 :0] be; + logic[magia_pkg::DATA_W-1:0] wdata; + logic we; + } core_data_req_t; + + typedef struct packed { + logic gnt; + logic rvalid; + logic[magia_pkg::DATA_W-1:0] rdata; + logic err; + } core_data_rsp_t; +`endif + + // EU Direct Link interface types + typedef struct packed { + logic req; + logic[magia_pkg::ADDR_W-1:0] addr; + logic wen; // Write enable negated (EU convention) + logic[magia_pkg::DATA_W-1:0] wdata; + logic[3 :0] be; + } eu_direct_req_t; + + typedef struct packed { + logic gnt; + logic rvalid; + logic[magia_pkg::DATA_W-1:0] rdata; + logic err; // Error signal (r_opc from XBAR_PERIPH_BUS) + } eu_direct_rsp_t; typedef struct packed { logic[NR_FETCH_PORTS-1:0] req; @@ -366,12 +451,31 @@ package magia_tile_pkg; logic[NR_FETCH_PORTS-1:0] rerror; } core_cache_instr_rsp_t; +`ifdef CV32E40X + typedef enum logic[1:0]{ + OBI_XBAR_STACK_IDX = 3, + OBI_XBAR_RESERVED_IDX = 2, + OBI_XBAR_L1SPM_IDX = 1, + OBI_XBAR_L2_IDX = 0 + } obi_mem_array_idx_e; +`else + typedef enum 
logic[2:0]{ + OBI_XBAR_STACK_IDX = 6, + OBI_XBAR_RESERVED_IDX = 5, + OBI_XBAR_FSYNC_CTRL_IDX = 4, + OBI_XBAR_IDMA_IDX = 3, + OBI_XBAR_REDMULE_CTRL_IDX = 2, + OBI_XBAR_L1SPM_IDX = 1, + OBI_XBAR_L2_IDX = 0 + } obi_mem_array_idx_e; +`endif + typedef enum logic[1:0]{ - STACK_IDX = 3, - RESERVED_IDX = 2, - L1SPM_IDX = 1, - L2_IDX = 0 - } mem_array_idx_e; + AXI_XBAR_STACK_IDX = 3, + AXI_XBAR_RESERVED_IDX = 2, + AXI_XBAR_L1SPM_IDX = 1, + AXI_XBAR_L2_IDX = 0 + } axi_mem_array_idx_e; typedef enum logic[1:0]{ AXI_EXT_IDX = 3, diff --git a/hw/tile/obi_slave_fsync.sv b/hw/tile/obi_slave_fsync.sv new file mode 100644 index 0000000..e1a0b9f --- /dev/null +++ b/hw/tile/obi_slave_fsync.sv @@ -0,0 +1,293 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Solderpad Hardware License, Version 0.51 + * (the "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: SHL-0.51 + * + * Authors: Luca Balboni + * Based on fractal_sync_xif_inst_decoder by Victor Isachi + * + * OBI Slave Fractal Sync Memory-Mapped Controller + * Replaces XIF interface with memory-mapped register access + */ + +module obi_slave_fsync + import magia_tile_pkg::*; + import magia_pkg::*; +#( + parameter logic [ADDR_W-1:0] BASE_ADDR = magia_tile_pkg::FSYNC_CTRL_ADDR_START, + parameter int unsigned AGGR_W = magia_tile_pkg::FSYNC_AGGR_W, + parameter int unsigned ID_W = magia_tile_pkg::FSYNC_ID_W, + parameter int unsigned NBR_AGGR_W = magia_tile_pkg::FSYNC_NBR_AGGR_W, + parameter int unsigned NBR_ID_W = magia_tile_pkg::FSYNC_NBR_ID_W, + parameter type obi_req_t = magia_tile_pkg::core_obi_data_req_t, + parameter type obi_rsp_t = magia_tile_pkg::core_obi_data_rsp_t +)( + input logic clk_i, + input logic rst_ni, + input logic clear_i, + + input obi_req_t obi_req_i, + output obi_rsp_t obi_rsp_o, + + fractal_sync_if.mst_port ht_fsync_if_o, + fractal_sync_if.mst_port hn_fsync_if_o, + fractal_sync_if.mst_port vt_fsync_if_o, + fractal_sync_if.mst_port vn_fsync_if_o, + + output logic done_o, + output logic error_o +); + +/*******************************************************/ +/** Internal Signal Definitions Beginning **/ +/*******************************************************/ + + logic clk_sync_en, clk_reg_en; + logic clk_sync_g, clk_reg_g; + + logic sync_trigger; + logic done; + logic addr_match; + + logic[DATA_W-1:0] aggr_reg, id_reg, status_reg, control_reg; + + typedef enum logic[1:0] { + IDLE, + SYNC, + WAIT, + DONE + } sync_state_e; + + sync_state_e c_sync_state, n_sync_state; + + // Memory Map: + // BASE_ADDR + 0x00: AGGR_REG (write-only) + // BASE_ADDR + 0x04: ID_REG (write-only) + // BASE_ADDR + 0x08: CONTROL_REG (write-only, writing triggers sync) + // BASE_ADDR + 0x0C: STATUS_REG (read-only) + localparam logic [ADDR_W-1:0] AGGR_REG_OFFSET = 4'h0; + localparam logic [ADDR_W-1:0] ID_REG_OFFSET = 4'h4; + localparam logic 
[ADDR_W-1:0] CONTROL_REG_OFFSET = 4'h8; + localparam logic [ADDR_W-1:0] STATUS_REG_OFFSET = 4'hC; + +/*******************************************************/ +/** Internal Signal Definitions End **/ +/*******************************************************/ +/** Hardwired Signals Beginning **/ +/*******************************************************/ + + assign addr_match = (obi_req_i.a.addr >= BASE_ADDR) && + (obi_req_i.a.addr < BASE_ADDR + 32'h100); + + assign done_o = done; + assign error_o = ht_fsync_if_o.error | hn_fsync_if_o.error | + vt_fsync_if_o.error | vn_fsync_if_o.error; + + // Status register: bit 0 = done, bit 1 = error, bit 2 = busy + // For polling: when busy=0, operation is complete + assign status_reg = {29'b0, (c_sync_state == SYNC || c_sync_state == WAIT), error_o, done}; + +/*******************************************************/ +/** Hardwired Signals End **/ +/*******************************************************/ +/** Clock gating Beginning **/ +/*******************************************************/ + + tc_clk_gating i_reg_clock_gating ( + .clk_i , + .en_i ( clk_reg_en ), + .test_en_i ( '0 ), + .clk_o ( clk_reg_g ) + ); + + tc_clk_gating i_sync_clock_gating ( + .clk_i , + .en_i ( clk_sync_en ), + .test_en_i ( '0 ), + .clk_o ( clk_sync_g ) + ); + +/*******************************************************/ +/** Clock gating End **/ +/*******************************************************/ +/** OBI Interface Logic Beginning **/ +/*******************************************************/ + + always_comb begin: obi_interface + obi_rsp_o = '0; + sync_trigger = 1'b0; + clk_reg_en = 1'b0; // NOTE(review): clk_reg_g is presumably only active during a matched OBI access, so clear_i alone may not reach the configuration registers - verify + + if (obi_req_i.req && addr_match) begin + obi_rsp_o.gnt = 1'b1; + obi_rsp_o.rvalid = 1'b1; // NOTE(review): rvalid is asserted combinationally in the same cycle as gnt - confirm the OBI master/xbar accepts a same-cycle response + clk_reg_en = 1'b1; // Enable clock for OBI register access + + // OBI protocol: assign response ID and optional fields + obi_rsp_o.r.rid = obi_req_i.a.aid; + obi_rsp_o.r.r_optional = '0; + obi_rsp_o.r.err = 1'b0; + + if (obi_req_i.a.we) begin + // Write operation 
+ case (obi_req_i.a.addr - BASE_ADDR) + CONTROL_REG_OFFSET: begin + sync_trigger = 1'b1; // Writing to control register triggers sync + end + default: begin + // Writes to AGGR_REG and ID_REG are handled in register logic + end + endcase + end else begin + // Read operation + case (obi_req_i.a.addr - BASE_ADDR) + STATUS_REG_OFFSET: begin + obi_rsp_o.r.rdata = status_reg; + end + default: begin + obi_rsp_o.r.rdata = 32'h0; // Return 0 for write-only registers + end + endcase + end + end + end + +/*******************************************************/ +/** OBI Interface Logic End **/ +/*******************************************************/ +/** Register Logic Beginning **/ +/*******************************************************/ + + always_ff @(posedge clk_reg_g, negedge rst_ni) begin: configuration_registers + if (~rst_ni) begin + aggr_reg <= '0; + id_reg <= '0; + end else begin + if (clear_i) begin + aggr_reg <= '0; + id_reg <= '0; + end else if (obi_req_i.req && addr_match && obi_req_i.a.we) begin + case (obi_req_i.a.addr - BASE_ADDR) + AGGR_REG_OFFSET: begin + aggr_reg <= obi_req_i.a.wdata; + end + ID_REG_OFFSET: begin + id_reg <= obi_req_i.a.wdata; + end + endcase + end + end + end + +/*******************************************************/ +/** Register Logic End **/ +/*******************************************************/ +/** Synchronization FSM Beginning **/ +/*******************************************************/ + + always_comb begin: sync_logic + n_sync_state = c_sync_state; + clk_sync_en = 1'b1; + done = 1'b0; + ht_fsync_if_o.sync = 1'b0; + ht_fsync_if_o.aggr = '0; + ht_fsync_if_o.id_req = '0; + hn_fsync_if_o.sync = 1'b0; + hn_fsync_if_o.aggr = '0; + hn_fsync_if_o.id_req = '0; + vt_fsync_if_o.sync = 1'b0; + vt_fsync_if_o.aggr = '0; + vt_fsync_if_o.id_req = '0; + vn_fsync_if_o.sync = 1'b0; + vn_fsync_if_o.aggr = '0; + vn_fsync_if_o.id_req = '0; + + case (c_sync_state) + IDLE: begin + if (sync_trigger) begin + n_sync_state = SYNC; + end else 
begin + clk_sync_en = 1'b0; + end + end + + SYNC: begin + n_sync_state = WAIT; + if (aggr_reg != 1) begin // Tree (level > 1) request + case (id_reg[0]) + 1'b0: begin // Horizontal tree node request + ht_fsync_if_o.sync = 1'b1; + ht_fsync_if_o.aggr = aggr_reg[AGGR_W-1:0]; + ht_fsync_if_o.id_req = id_reg[ID_W-1:0]; + end + 1'b1: begin // Vertical tree node request + vt_fsync_if_o.sync = 1'b1; + vt_fsync_if_o.aggr = aggr_reg[AGGR_W-1:0]; + vt_fsync_if_o.id_req = id_reg[ID_W-1:0]; + end + endcase + end else begin // Neighbor (level = 1) request + case (id_reg[1:0]) + 2'b00: begin // Horizontal tree node request + ht_fsync_if_o.sync = 1'b1; + ht_fsync_if_o.aggr = aggr_reg[AGGR_W-1:0]; + ht_fsync_if_o.id_req = id_reg[ID_W-1:0]; + end + 2'b01: begin // Vertical tree node request + vt_fsync_if_o.sync = 1'b1; + vt_fsync_if_o.aggr = aggr_reg[AGGR_W-1:0]; + vt_fsync_if_o.id_req = id_reg[ID_W-1:0]; + end + 2'b10: begin // Horizontal neighbor node request + hn_fsync_if_o.sync = 1'b1; + hn_fsync_if_o.aggr = aggr_reg[NBR_AGGR_W-1:0]; + hn_fsync_if_o.id_req = id_reg[NBR_ID_W-1:0]; + end + 2'b11: begin // Vertical neighbor node request + vn_fsync_if_o.sync = 1'b1; + vn_fsync_if_o.aggr = aggr_reg[NBR_AGGR_W-1:0]; + vn_fsync_if_o.id_req = id_reg[NBR_ID_W-1:0]; + end + endcase + end + end + + WAIT: begin + if (ht_fsync_if_o.wake | hn_fsync_if_o.wake | vt_fsync_if_o.wake | vn_fsync_if_o.wake) begin + n_sync_state = DONE; + end else begin + n_sync_state = WAIT; + end + end + + DONE: begin + n_sync_state = IDLE; + done = 1'b1; + end + endcase + end + + always_ff @(posedge clk_sync_g, negedge rst_ni) begin: sync_state + if (~rst_ni) c_sync_state <= IDLE; + else begin + if (clear_i) c_sync_state <= IDLE; + else c_sync_state <= n_sync_state; + end + end + +/*******************************************************/ +/** Synchronization FSM End **/ +/*******************************************************/ + +endmodule: obi_slave_fsync \ No newline at end of file diff --git 
a/hw/tile/xbar_periph_bus_if.sv b/hw/tile/xbar_periph_bus_if.sv new file mode 100644 index 0000000..df7d5cb --- /dev/null +++ b/hw/tile/xbar_periph_bus_if.sv @@ -0,0 +1,42 @@ +// Copyright 2018 ETH Zurich and University of Bologna. +// Copyright and related rights are licensed under the Solderpad Hardware +// License, Version 0.51 (the "License"); you may not use this file except in +// compliance with the License. You may obtain a copy of the License at +// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law +// or agreed to in writing, software, hardware and materials distributed under +// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// +// Authors: Luca Balboni + +interface XBAR_PERIPH_BUS #( + parameter int ID_WIDTH = 2 // typically number of cores plus one +); + + // Request Channel + logic req; + logic [31:0] add; + logic wen; + logic [31:0] wdata; + logic [3:0] be; + logic gnt; + logic [ID_WIDTH-1:0] id; + + // Response Channel + logic r_valid; + logic r_opc; + logic [ID_WIDTH-1:0] r_id; + logic [31:0] r_rdata; + + modport Master ( + output req, add, wen, wdata, be, id, + input gnt, r_rdata, r_opc, r_id, r_valid + ); + + modport Slave ( + input req, add, wen, wdata, be, id, + output gnt, r_rdata, r_opc, r_id, r_valid + ); + +endinterface \ No newline at end of file diff --git a/setup_env.sh b/setup_env.sh index f19f95e..ed0bd50 100644 --- a/setup_env.sh +++ b/setup_env.sh @@ -1,3 +1,6 @@ +core="$1" +echo "Selected core: $core" + export MAGIA_DIR=$(pwd) echo "Exporting MAGIA path to $MAGIA_DIR" export PATH=$MAGIA_DIR:$PATH @@ -11,7 +14,13 @@ export PATH=/usr/pack/gcc-5.2.0-af/x86_64-rhe6-linux/bin:$PATH export PATH=/usr/local/anaconda3-2023.07/condabin:$PATH export PATH=/home/visachi/.local/bin:$PATH export XLEN=32 -export XTEN=imafc +if [[ "$core" == 
"CV32E40P" ]]; then + echo "Exporting ISA extentions: I, M, F, C, XPULP_V2" + export XTEN=imfcxpulpv2 +else + echo "Exporting ISA extentions: I, M, A, F, C" + export XTEN=imafc +fi echo "Sourcing python virtual environment" source ./magia_venv/bin/activate echo "Finished setting up the environment" \ No newline at end of file diff --git a/sw/.gitignore b/sw/.gitignore deleted file mode 100644 index 26abe44..0000000 --- a/sw/.gitignore +++ /dev/null @@ -1,13 +0,0 @@ -tests/amo_test/* -tests/boot_test/* -tests/fpu_test/* -tests/fsync_extended_test/* -tests/fsync_test/* -tests/hello_mesh/* -tests/hello_world/* -tests/idma_test/* -tests/inter_l1_test/* -tests/l1_test/* -tests/mesh_test/* -tests/redmule_test/* -tests/tile_test/* \ No newline at end of file diff --git a/sw/tests/eu_tests/event_unit_test.c b/sw/tests/eu_tests/event_unit_test.c new file mode 100644 index 0000000..c93a34a --- /dev/null +++ b/sw/tests/eu_tests/event_unit_test.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * + * MAGIA Event Unit Tile Stress Test - Event Unit WFE API Version + * Uses event_unit_utils.h for Event Unit control and WFE/polling + * Tests concurrent RedMulE and IDMA operations with out-of-order completions + * + */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "redmule_mm_utils.h" +#include "idma_mm_utils.h" +#include "event_unit_utils.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +#define X_BASE_1 (L1_BASE + 0x00012048) +#define W_BASE_1 (L1_BASE + 0x00016048) +#define Y_BASE_1 (L1_BASE + 0x0001A048) +#define X_BASE_2 (L1_BASE + 0x0001E048) +#define W_BASE_2 (L1_BASE + 0x00022048) +#define Y_BASE_2 (L1_BASE + 0x00026048) + +#define Z_BASE_1 (L2_BASE + 0x00001000) +#define Z_BASE_2 (L2_BASE + 0x00005000) +#define Z_BASE_4 (L2_BASE + 0x0000D000) + +#define DMA_BUFFER_1 (L1_BASE + 0x00036048) +#define DMA_BUFFER_2 (L1_BASE + 0x0003A048) + +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +#define VERBOSE (1) + +#define USE_WFE (1) + +#define WAIT_CYCLES (10) + +#define DIFF_TH (0x0011) + +#define DMA_CHUNK_SIZE (M_SIZE * N_SIZE * 2) +// Launches one RedMulE job and two iDMA transfers, waits for all three through the Event Unit, then checks the results; returns the error count (1 on timeout). +int main(void) { + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + uint32_t redmule_completed = 0; + uint32_t idma_a2o_completed = 0; + uint32_t idma_o2a_completed = 0; + + // Initialize Event Unit once + eu_init(); + + // Setup test data + printf("Setting up test data...\n"); + + // X matrix for RedMulE + for (int i = 0; i < M_SIZE*N_SIZE; i++) + mmio16(X_BASE_1 + 2*i) = x_inp[i]; + + // W matrix for RedMulE + for (int i = 0; i < N_SIZE*K_SIZE; i++) + mmio16(W_BASE_1 + 2*i) = w_inp[i]; + + // Y matrix (accumulator) for RedMulE + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Y_BASE_1 + 2*i) = y_inp[i]; + + // Z - golden (reference) for RedMulE + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Z_BASE_1 + 2*i) = z_oup[i]; + + // Initialize IDMA test data + for (int i = 0; 
i < DMA_CHUNK_SIZE/2; i++) { + uint16_t test_pattern = (uint16_t)(0x1000 + (i & 0xFFF)); + mmio16(Z_BASE_4 + 2*i) = test_pattern; + } + +#if VERBOSE > 10 + printf("Test data setup complete\n"); +#endif + + printf("Testing concurrent RedMulE and IDMA operations...\n"); + + // Initialize Event Unit BEFORE launching operations + eu_multi_init(1, 1, 1, 0); // Enable RedMulE, IDMA A2O, IDMA O2A, disable FSync + + // Launch RedMulE operation + printf("Launching RedMulE operation...\n"); + hwpe_cg_enable(); + hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) + ; + + redmule_cfg((unsigned int)X_BASE_1, (unsigned int)W_BASE_1, (unsigned int)Y_BASE_1, + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops, (uint8_t)Float16); + + // Launch IDMA operations + printf("Launching IDMA operations...\n"); + + // First IDMA transfer: L2 to L1 + dst_addr = (uint32_t)DMA_BUFFER_1; + src_addr = (uint32_t)Z_BASE_4; + len = (uint32_t)DMA_CHUNK_SIZE; + + uint32_t transfer_id_1 = idma_L2ToL1(src_addr, dst_addr, len); + + + // Second IDMA transfer: L1 to L2 (NOTE(review): it reads DMA_BUFFER_1, the destination of the first transfer, without waiting for it; Z_BASE_2 is never verified below) + dst_addr = (uint32_t)Z_BASE_2; + src_addr = (uint32_t)DMA_BUFFER_1; + len = (uint32_t)DMA_CHUNK_SIZE; + + uint32_t transfer_id_2 = idma_L1ToL2(src_addr, dst_addr, len); + printf("iDMA transfer 2 (L1->L2) started, ID: %d\n", transfer_id_2); + + // Trigger RedMulE after IDMA to create concurrency + hwpe_trigger_job(); + + // Wait for ALL accelerators with a single eu_multi_wait_all call + printf("Waiting for ALL accelerators completion (RedMulE + IDMA A2O + IDMA O2A)...\n"); + + eu_wait_mode_t wait_mode = USE_WFE ? 
EU_WAIT_MODE_WFE : EU_WAIT_MODE_POLLING; + uint32_t all_events = eu_multi_wait_all(1, 1, 1, 0, wait_mode); + + // eu_multi_wait_all returns only when ALL events are present (or 0 on timeout) + if (all_events) { + redmule_completed = 1; + idma_a2o_completed = 1; + idma_o2a_completed = 1; + } + // If all_events == 0, it means timeout occurred + + // Check for timeout + if (!(redmule_completed && idma_a2o_completed && idma_o2a_completed)) { + mmio16(TEST_END_ADDR) = FAIL_EXIT_CODE; + return 1; + } + + // Disable RedMulE + hwpe_cg_disable(); + + unsigned int num_errors = 0; + + // Verify RedMulE results + uint16_t computed, expected, diff; + for(int i = 0; i < M_SIZE*K_SIZE; i++){ + computed = mmio16(Y_BASE_1 + 2*i); + expected = mmio16(Z_BASE_1 + 2*i); + diff = (computed > expected) ? (computed - expected) : (expected - computed); + if(diff > DIFF_TH){ + num_errors++; + } + } + + // Verify IDMA results (basic integrity check) + uint32_t idma_errors = 0; + for(int i = 0; i < 100; i++) { // Check first 100 elements + uint16_t source_data = mmio16(Z_BASE_4 + 2*i); + uint16_t copied_data = mmio16(DMA_BUFFER_1 + 2*i); + if(source_data != copied_data) { + idma_errors++; + } + } + + num_errors += idma_errors; + + // Event Unit integrity check (cannot fire here: the timeout path above has already returned when any flag is 0) + if (!(redmule_completed && idma_a2o_completed && idma_o2a_completed)) { + num_errors++; + } + + printf("Finished test with %0d errors\n", num_errors); + + return num_errors; +} \ No newline at end of file diff --git a/sw/tests/eu_tests/fsync_extended_test_event_unit.c b/sw/tests/eu_tests/fsync_extended_test_event_unit.c new file mode 100644 index 0000000..a3125b5 --- /dev/null +++ b/sw/tests/eu_tests/fsync_extended_test_event_unit.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on fsync_extended_test.c by Victor Isachi + * + * MAGIA FractalSync Memory-Mapped Synchronization Test - Event Unit Version + * Uses event_unit_utils.h for Event Unit control and WFE/polling + * + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "fsync_mm_utils.h" +#include "fsync_mm_api.h" +#include "event_unit_utils.h" +#include "cache_fill.h" + +#define VERBOSE (0) + +#define USE_WFE (1) +// Runs each FractalSync MM pattern in turn (custom 4x4 when NUM_HARTS == 16, then hnbr, hring, vnbr, vring, rows, cols, global), waiting on the Event Unit after each one; always returns 0. +int main(void) { + uint32_t aggregates[NUM_HARTS]; + uint32_t ids[NUM_HARTS]; + + // Initialize Event Unit once + eu_init(); + +#if NUM_HARTS == 16 + /// Custom 4x4 synch.: every hart id 0-15 is given the same aggregate (0b1111) and id (7) 
+ switch (get_hartid()){ + case 0: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 1: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 2: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 3: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 4: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 5: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 6: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 7: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 8: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 9: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 10: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 11: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 12: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 13: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 14: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 15: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + } + + // h_pprintf("FractalSync aggregate: 0b"); pprintf(bs(aggregates[get_hartid()])); pprintf(", id: "); pprintf(ds(ids[get_hartid()])); n_pprintf("..."); + printf("FractalSync aggregate: 0x%0x, id: %0d...\n", aggregates[get_hartid()], ids[get_hartid()]); + + // Clear Event Unit and ensure FSync mask is enabled (done before the sync request is issued) + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm(ids[get_hartid()], aggregates[get_hartid()]); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + sentinel_instr_id(); +#endif + + printf("[FractalSync MM] Horizontal neighbor test starting\n"); + + // Clear Event Unit and ensure FSync mask is enabled + 
eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_hnbr(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + + sentinel_instr_id(); + printf("[FractalSync MM] Horizontal neighbor test ending\n"); + + printf("[FractalSync MM] Horizontal ring neighbor test starting\n"); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_hring(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + + sentinel_instr_id(); + printf("[FractalSync MM] Horizontal ring neighbor test ending\n"); + + printf("[FractalSync MM] Vertical neighbor test starting\n"); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_vnbr(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + sentinel_instr_id(); + printf("[FractalSync MM] Vertical neighbor test ending\n"); + + printf("[FractalSync MM] Vertical ring neighbor test starting\n"); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_vring(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + sentinel_instr_id(); + printf("[FractalSync MM] Vertical ring neighbor test ending\n"); + + printf("[FractalSync MM] Row test starting\n"); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_rows(); + + if (USE_WFE) { + 
eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + sentinel_instr_id(); + printf("[FractalSync MM] Row test ending\n"); + + printf("[FractalSync MM] Column test starting\n"); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_cols(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + sentinel_instr_id(); + printf("[FractalSync MM] Column test ending\n"); + + printf("[FractalSync MM] Global test starting\n"); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_global(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + sentinel_instr_id(); + printf("[FractalSync MM] Global test ending\n"); + + // h_pprintf("FractalSync test finished...\n"); + printf("FractalSync MM test finished...\n"); + + + + return 0; +} \ No newline at end of file diff --git a/sw/tests/eu_tests/fsync_test_event_unit.c b/sw/tests/eu_tests/fsync_test_event_unit.c new file mode 100644 index 0000000..be760c3 --- /dev/null +++ b/sw/tests/eu_tests/fsync_test_event_unit.c @@ -0,0 +1,359 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on fsync_test_mm.c by Victor Isachi + * + * MAGIA FractalSync Memory-Mapped Synchronization Test - Event Unit Version + * Uses event_unit_utils.h for Event Unit control and WFE/polling + * + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "fsync_mm_utils.h" +#include "fsync_mm_api.h" +#include "event_unit_utils.h" +#include "cache_fill.h" + +#define VERBOSE (0) + +#define USE_WFE (1) + +#define CLIB_FS_MM_TEST +// #define GLOBAL_FS_MM_TEST +// #define ROW_FS_MM_TEST +// #define COL_FS_MM_TEST +// #define HNBR_FS_MM_TEST +// #define VNBR_FS_MM_TEST +// #define HRING_FS_MM_TEST +// #define VRING_FS_MM_TEST + +#define NUM_LEVELS (31-__builtin_clz(NUM_HARTS)) + + +#define CACHE_HEAT_CYCLES (3) +// Repeats the enabled *_FS_MM_TEST section(s) CACHE_HEAT_CYCLES times, bracketing each sync + Event Unit wait with sentinel_start()/sentinel_end(); always returns 0. +int main(void) { + uint32_t tile_hartid = get_hartid(); + uint32_t tile_xhartid = GET_X_ID(tile_hartid); + uint32_t tile_yhartid = GET_Y_ID(tile_hartid); + + // Initialize Event Unit once + eu_init(); + + printf("Starting Fractal Sync Memory-Mapped test...\n"); + + // Filling up the cache + fill_icache(); + + // Execute synchronization multiple times to pre-heat the cache + for (int i = 0; i < CACHE_HEAT_CYCLES; i++) { +#ifdef CLIB_FS_MM_TEST + // Climb FS tree test using memory-mapped interface (note: this loop's i shadows the cache-heating loop's i) + for (int i = 0; i < NUM_LEVELS; i++){ + printf("Fractal Sync at level %0d...\n", i+1); + + uint32_t aggregates = (1 << (i+1))-1; + uint32_t ids = 0; +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", aggregates); + printf("id: 0x%0x\n", ids); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm(ids, aggregates); + + + if (USE_WFE) { + 
eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + + // Instruction immediately following synchronization: indicates end of the synchronization region (NOTE(review): with USE_WFE set, the printf above runs before this sentinel) + sentinel_end(); + + printf("Synchronized...\n"); + } +#endif + +#ifdef GLOBAL_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync global synchronization test...\n"); +#endif + +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", _FS_MM_GLOBAL_AGGR); + printf("id: 0x%0x\n", _FS_MM_GLOBAL_ID); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_global(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef HNBR_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync horizontal neighbor synchronization test...\n"); +#endif + +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", _FS_MM_HNBR_AGGR); + printf("id: 0x%0x\n", _FS_MM_HNBR_ID); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_hnbr(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + 
printf("Synchronized...\n"); +#endif +#endif + +#ifdef VNBR_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync vertical neighbor synchronization test...\n"); +#endif + +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", _FS_MM_VNBR_AGGR); + printf("id: 0x%0x\n", _FS_MM_VNBR_ID); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_vnbr(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef HRING_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync horizontal ring synchronization test...\n"); +#endif + +#if VERBOSE > 10 + if ((tile_xhartid == 0) || (tile_xhartid == MESH_X_TILES-1)){ + uint32_t id = row_id_lookup_mm(tile_yhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_LVL); + printf("id: 0x%0x\n", id); + } else { + printf("aggregate: 0x%0x\n", _FS_MM_HRING_AGGR); + printf("id: 0x%0x\n", _FS_MM_HRING_ID); + } +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_hring(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef VRING_FS_MM_TEST 
+#if VERBOSE > 1 + printf("Fractal Sync vertical ring synchronization test...\n"); +#endif + +#if VERBOSE > 10 + if ((tile_yhartid == 0) || (tile_yhartid == MESH_Y_TILES-1)){ + uint32_t id = col_id_lookup_mm(tile_xhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_LVL); + printf("id: 0x%0x\n", id); + } else { + printf("aggregate: 0x%0x\n", _FS_MM_VRING_AGGR); + printf("id: 0x%0x\n", _FS_MM_VRING_ID); + } +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_vring(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef ROW_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync row synchronization test...\n"); +#endif + +#if VERBOSE > 10 + uint32_t id = row_id_lookup_mm(tile_yhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_AGGR); + printf("id: 0x%0x\n", id); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_rows(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef COL_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync column 
synchronization test...\n"); +#endif + +#if VERBOSE > 10 + uint32_t id = col_id_lookup_mm(tile_xhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_AGGR); + printf("id: 0x%0x\n", id); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + // Clear Event Unit and ensure FSync mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); + + fsync_mm_cols(); + + if (USE_WFE) { + eu_fsync_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); + } + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + } + + printf("Fractal Sync Memory-Mapped test finished...\n"); + + + return 0; +} \ No newline at end of file diff --git a/sw/tests/eu_tests/idma_test_event_unit.c b/sw/tests/eu_tests/idma_test_event_unit.c new file mode 100644 index 0000000..988d8d9 --- /dev/null +++ b/sw/tests/eu_tests/idma_test_event_unit.c @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on idma_test.c by Victor Isachi + * + * MAGIA iDMA Test - Event Unit WFE API Version + * Uses event_unit_utils.h for Event Unit control and WFE/polling + * + */ + +#include "magia_tile_utils.h" +#include "idma_mm_utils.h" +#include "event_unit_utils.h" + +#include "x_input.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define Y_BASE (L1_BASE + 0x00016048) +#define Z_BASE (L2_BASE + 0x00001000) +#define W_BASE (L2_BASE + 0x00005000) + +#define M_SIZE (96) +#define N_SIZE (64) + +#define VERBOSE (0) + +#define USE_WFE (1) + +#define WAIT_CYCLES (10) + +#define CONCURRENT +// Copies x_inp L2->L1 (X_BASE), then L1->L2 (W_BASE), optionally L2->L1 again (Y_BASE), waiting on the Event Unit after each launch; returns the number of mismatching elements. +int main(void) { + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + uint32_t dst_std_2; + uint32_t src_std_2; + uint32_t reps_2; + + uint32_t dst_std_3; + uint32_t src_std_3; + uint32_t reps_3; + + // Initialize Event Unit once + eu_init(); + + // Z - golden (reference) + for (int i = 0; i < M_SIZE*N_SIZE; i++) + mmio16(Z_BASE + 2*i) = x_inp[i]; +#if VERBOSE > 100 + for (int i = 0; i < M_SIZE*N_SIZE; i++) + printf("Z[%8x]: 0x%4x\n", Z_BASE + 2*i, mmio16(Z_BASE + 2*i)); +#endif + + dst_addr = (uint32_t)X_BASE; + src_addr = (uint32_t)Z_BASE; + len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x (X_BASE)\n", dst_addr); + printf("src_addr: 0x%8x (Z_BASE)\n", src_addr); + printf("len: %0d\n", len); +#endif + + dst_std_2 = 0; + src_std_2 = 0; + reps_2 = 1; +#if VERBOSE > 10 + printf("dst_std_2: 0x%8x\n", dst_std_2); + printf("src_std_2: 0x%8x\n", src_std_2); + printf("reps_2: 0x%8x\n", reps_2); +#endif + + dst_std_3 = 0; + src_std_3 = 0; + reps_3 = 1; +#if VERBOSE > 10 + printf("dst_std_3: 0x%8x\n", dst_std_3); + printf("src_std_3: 0x%8x\n", src_std_3); + printf("reps_3: 0x%8x\n", reps_3); +#endif + + // Clear Event Unit and ensure A2O mask is enabled BEFORE starting the transfer, so an early completion event cannot be cleared + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_A2O_DONE_MASK); + + uint32_t transfer_id_1 = idma_L2ToL1(src_addr, dst_addr, len); + printf("iDMA moving data from L2 to L1...\n"); 
+ + if (USE_WFE) { + eu_idma_wait_a2o_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_idma_wait_a2o_completion(EU_WAIT_MODE_POLLING); + } + + dst_addr = (uint32_t)W_BASE; + src_addr = (uint32_t)X_BASE; + len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x (W_BASE)\n", dst_addr); + printf("src_addr: 0x%8x (X_BASE)\n", src_addr); + printf("len: %0d\n", len); +#endif + + dst_std_2 = 0; + src_std_2 = 0; + reps_2 = 1; +#if VERBOSE > 10 + printf("dst_std_2: 0x%8x\n", dst_std_2); + printf("src_std_2: 0x%8x\n", src_std_2); + printf("reps_2: 0x%8x\n", reps_2); +#endif + + dst_std_3 = 0; + src_std_3 = 0; + reps_3 = 1; +#if VERBOSE > 10 + printf("dst_std_3: 0x%8x\n", dst_std_3); + printf("src_std_3: 0x%8x\n", src_std_3); + printf("reps_3: 0x%8x\n", reps_3); +#endif + + // Clear Event Unit and ensure O2A mask is enabled BEFORE starting the transfer, so an early completion event cannot be cleared + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_O2A_DONE_MASK); + + uint32_t transfer_id_2 = idma_L1ToL2(src_addr, dst_addr, len); + + printf("iDMA moving data from L1 to L2...\n"); + + if (USE_WFE) { + eu_idma_wait_o2a_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_idma_wait_o2a_completion(EU_WAIT_MODE_POLLING); + } + +#ifdef CONCURRENT + // Setup concurrent transfer L2->L1 to Y_BASE + dst_addr = (uint32_t)Y_BASE; + src_addr = (uint32_t)Z_BASE; + len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x (Y_BASE)\n", dst_addr); + printf("src_addr: 0x%8x (Z_BASE)\n", src_addr); + printf("len: %0d\n", len); +#endif + + // Clear Event Unit and ensure both masks are enabled BEFORE starting the transfer, so an early completion event cannot be cleared + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_ALL_DONE_MASK); + + // Start both transfers concurrently + uint32_t transfer_id_o2a = transfer_id_2; // OBI2AXI (L1->L2) was started and waited for above + uint32_t transfer_id_a2o = idma_L2ToL1(src_addr, dst_addr, len); // Start AXI2OBI (L2->L1) + + printf("iDMA moving concurrently data from L1 to L2 and from L2 to L1...\n"); 
+ + if (USE_WFE) { + eu_idma_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_idma_wait_completion(EU_WAIT_MODE_POLLING); + } +#endif + + printf("Verifying results...\n"); + + unsigned int num_errors = 0; + + uint16_t detected_l1, detected_l2, expected; + for(int i = 0; i < M_SIZE*N_SIZE; i++){ + detected_l2 = mmio16(W_BASE + 2*i); +#ifdef CONCURRENT + detected_l1 = mmio16(Y_BASE + 2*i); +#else + detected_l1 = mmio16(X_BASE + 2*i); +#endif + expected = mmio16(Z_BASE + 2*i); + if((detected_l2 != expected) || (detected_l1 != expected)){ + num_errors++; + printf("**ERROR**: DETECTED L2[%0d](=0x%4x) || DETECTED L1[%0d](=0x%4x) != EXPECTED[%0d](=0x%4x)\n", i, detected_l2, i, detected_l1, i, expected); + } + } + printf("Finished test with %0d errors\n", num_errors); + + return num_errors; +} \ No newline at end of file diff --git a/sw/tests/eu_tests/mesh_test_event_unit.c b/sw/tests/eu_tests/mesh_test_event_unit.c new file mode 100644 index 0000000..aecc049 --- /dev/null +++ b/sw/tests/eu_tests/mesh_test_event_unit.c @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on mesh_test.c by Victor Isachi + * + * MAGIA Mesh Test - Pure Event Unit API Version + * Uses ONLY event_unit_utils for WFE/polling + * + * Configuration: + * - Set USE_WFE to 1 for WFE (Wait-For-Event) mode + * - Set USE_WFE to 0 for Event Unit polling mode + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "redmule_mm_utils.h" +#include "idma_mm_utils.h" +#include "event_unit_utils.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define W_BASE (L1_BASE + 0x00016048) +#define Y_BASE (L1_BASE + 0x0001A048) +#define Z_BASE (L2_BASE + 0x00042000) // Note: for a large number of tiles (e.g. 64x64 mesh) we might exceed memory range of L2 +#define V_BASE (L2_BASE + 0x00046000) // Note: for a large number of tiles (e.g. 64x64 mesh) we might exceed memory range of L2 +#define T_BASE (L2_BASE + 0x0004A000) // Note: for a large number of tiles (e.g. 
64x64 mesh) we might exceed memory range of L2 + +#define MHARTID_OFFSET (0x00010000) + +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +#define VERBOSE (0) + +#define WAIT_CYCLES (10) + +#define DIFF_TH (0x0011) + +#define USE_WFE (0) +// Stages src_data (x_dim*y_dim 16-bit elements) in this hart's T_BASE region, copies it to dst_address with idma_L2ToL1 and waits for the A2O event; calls eu_init() on first use. +void idma_mv_in_pure_eu(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uint32_t dst_address) { + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + // Initialize Event Unit once + static int eu_initialized = 0; + if (!eu_initialized) { + eu_init(); + eu_initialized = 1; + } + + for (int i = 0; i < x_dim*y_dim; i++) + mmio16(T_BASE + get_hartid()*MHARTID_OFFSET + 2*i) = src_data[i]; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)(T_BASE + get_hartid()*MHARTID_OFFSET); + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%0x\n", dst_addr); + printf("src_addr: 0x%0x\n", src_addr); + printf("len: %0d\n", len); +#endif + + // Clear Event Unit and ensure A2O mask is enabled BEFORE starting the transfer, so an early completion event cannot be cleared + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_A2O_DONE_MASK); + + idma_L2ToL1(src_addr, dst_addr, len); + + // Use PURE Event Unit + eu_wait_mode_t wait_mode = USE_WFE ? 
EU_WAIT_MODE_WFE : EU_WAIT_MODE_POLLING; + + // Use direction-specific wait for L2->L1 (A2O, direction = 0) + eu_idma_wait_direction_completion(0, wait_mode); + +#if VERBOSE > 100 + for (int i = 0; i < x_dim*y_dim; i++){ + printf("DST[0x%0x]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i)); + } +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_addr + 2*i) != src_data[i]) { + num_errors++; + printf("DST[0x%0x]: 0x%0x != SRC[%0d]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i), i, src_data[i]); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} +// Copies x_dim*y_dim 16-bit elements from src_address to dst_address with idma_L1ToL2 and waits for the O2A event; does not call eu_init() itself. +void idma_mv_out_pure_eu(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, uint32_t dst_address) { + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)src_address; + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%0x\n", dst_addr); + printf("src_addr: 0x%0x\n", src_addr); + printf("len: %0d\n", len); +#endif + + // Clear Event Unit and ensure O2A mask is enabled BEFORE starting the transfer, so an early completion event cannot be cleared + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_O2A_DONE_MASK); + + idma_L1ToL2(src_addr, dst_addr, len); + + // Use PURE Event Unit + eu_wait_mode_t wait_mode = USE_WFE ? 
EU_WAIT_MODE_WFE : EU_WAIT_MODE_POLLING; + + // Use direction-specific wait for L1->L2 (O2A, direction = 1) + eu_idma_wait_direction_completion(1, wait_mode); + +#if VERBOSE > 100 + for (int i = 0; i < x_dim*y_dim; i++){ + printf("DST[0x%0x]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i)); + } +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_addr + 2*i) != mmio16(src_addr + 2*i)) { + num_errors++; + printf("DST[0x%0x]: 0x%0x != SRC[%0d]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i), i, mmio16(src_addr + 2*i)); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} +// Per hart: loads X, W, Y into L1 through iDMA, runs one RedMulE GEMM, moves the result to V_BASE and compares it with z_oup within DIFF_TH; returns this hart's error count. +int main(void) { + + // X + printf("Initializing X through iDMA...\n"); + idma_mv_in_pure_eu(M_SIZE, N_SIZE, x_inp, (X_BASE + get_hartid()*L1_TILE_OFFSET)); + + // W + printf("Initializing W through iDMA...\n"); + idma_mv_in_pure_eu(N_SIZE, K_SIZE, w_inp, (W_BASE + get_hartid()*L1_TILE_OFFSET)); + + // Y + printf("Initializing Y through iDMA...\n"); + idma_mv_in_pure_eu(M_SIZE, K_SIZE, y_inp, (Y_BASE + get_hartid()*L1_TILE_OFFSET)); + +#if VERBOSE > 10 + printf("K_SIZE: 0x%0x\n", K_SIZE); + printf("M_SIZE: 0x%0x\n", M_SIZE); + printf("N_SIZE: 0x%0x\n", N_SIZE); +#endif + + printf("Testing matrix multiplication with RedMulE...\n"); + + // Initialize and configure RedMulE using MM approach + hwpe_cg_enable(); + hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) + ; + + redmule_cfg((unsigned int)(X_BASE + get_hartid()*L1_TILE_OFFSET), + (unsigned int)(W_BASE + get_hartid()*L1_TILE_OFFSET), + (unsigned int)(Y_BASE + get_hartid()*L1_TILE_OFFSET), + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops, (uint8_t)Float16); + + // Clear Event Unit and ensure RedMulE mask is enabled BEFORE triggering the job, so an early completion event cannot be cleared + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_REDMULE_DONE_MASK); + + hwpe_trigger_job(); + + // Use PURE Event Unit + eu_wait_mode_t wait_mode = USE_WFE ? 
EU_WAIT_MODE_WFE : EU_WAIT_MODE_POLLING; + + // Wait for HWPE completion + eu_redmule_wait_completion(wait_mode); + + printf("Moving results through iDMA...\n"); + idma_mv_out_pure_eu(M_SIZE, K_SIZE, Y_BASE + get_hartid()*L1_TILE_OFFSET, V_BASE + get_hartid()*MHARTID_OFFSET); + + printf("Verifying results...\n"); + + unsigned int num_errors[NUM_HARTS]; + num_errors[get_hartid()] = 0; + + volatile uint16_t computed[NUM_HARTS], expected[NUM_HARTS], diff[NUM_HARTS]; + for(int i = 0; i < M_SIZE*K_SIZE; i++){ + computed[get_hartid()] = mmio16(V_BASE + get_hartid()*MHARTID_OFFSET + 2*i); + expected[get_hartid()] = z_oup[i]; + diff[get_hartid()] = (computed[get_hartid()] > expected[get_hartid()]) ? (computed[get_hartid()] - expected[get_hartid()]) : (expected[get_hartid()] - computed[get_hartid()]); + if(diff[get_hartid()] > DIFF_TH){ + num_errors[get_hartid()]++; + printf("**ERROR**: V[0x%0x](=0x%0x) != Z[%0d](=0x%0x)\n", V_BASE + get_hartid()*MHARTID_OFFSET + 2*i, computed[get_hartid()], i, expected[get_hartid()]); + } + } + printf("Finished test with %0d error(s)\n", num_errors[get_hartid()]); + + return num_errors[get_hartid()]; +} \ No newline at end of file diff --git a/sw/tests/eu_tests/redmule_test_event_unit.c b/sw/tests/eu_tests/redmule_test_event_unit.c new file mode 100644 index 0000000..5efe2d0 --- /dev/null +++ b/sw/tests/eu_tests/redmule_test_event_unit.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on redmule_test.c by Victor Isachi + * + * RedMulE Matrix Multiplication Test with Event Unit WFE API + * Uses event_unit_utils.h for Event Unit control and WFE/polling + * + */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "redmule_mm_utils.h" +#include "event_unit_utils.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define W_BASE (L1_BASE + 0x00016048) +#define Y_BASE (L1_BASE + 0x0001A048) +#define Z_BASE (L2_BASE + 0x00042000) + +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +#define VERBOSE (0) + +#define USE_WFE (1) + +#define WAIT_CYCLES (10) + +#define DIFF_TH (0x0011) + +int main(void) { + // X + for (int i = 0; i < M_SIZE*N_SIZE; i++) + mmio16(X_BASE + 2*i) = x_inp[i]; +#if VERBOSE > 10 + for (int i = 0; i < M_SIZE*N_SIZE; i++) + printf("X[%8x]: 0x%4x\n", X_BASE + 2*i, mmio16(X_BASE + 2*i)); +#endif + + // W + for (int i = 0; i < N_SIZE*K_SIZE; i++) + mmio16(W_BASE + 2*i) = w_inp[i]; +#if VERBOSE > 10 + for (int i = 0; i < N_SIZE*K_SIZE; i++) + printf("W[%8x]: 0x%4x\n", W_BASE + 2*i, mmio16(W_BASE + 2*i)); +#endif + + // Y + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Y_BASE + 2*i) = y_inp[i]; +#if VERBOSE > 10 + for (int i = 0; i < M_SIZE*K_SIZE; i++) + printf("Y[%8x]: 0x%4x\n", Y_BASE + 2*i, mmio16(Y_BASE + 2*i)); +#endif + + // Z - golden (reference) + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Z_BASE + 2*i) = z_oup[i]; +#if VERBOSE > 10 + for (int i = 0; i < M_SIZE*K_SIZE; i++) + printf("Z[%8x]: 0x%4x\n", Z_BASE + 2*i, mmio16(Z_BASE + 2*i)); +#endif + +#if VERBOSE > 10 + printf("K_SIZE: %4x\n", K_SIZE); + printf("M_SIZE: %4x\n", M_SIZE); + printf("N_SIZE: %4x\n", N_SIZE); +#endif + + // Initialize and configure RedMulE + hwpe_cg_enable(); + 
hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) + ; + + redmule_cfg((unsigned int)X_BASE, (unsigned int)W_BASE, (unsigned int)Y_BASE, + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops, (uint8_t)Float16); + + // Initialize Event Unit for RedMulE (done before the trigger so the completion event cannot be missed) + eu_redmule_init(); + + // Trigger the RedMulE job + printf("Testing matrix multiplication with RedMulE...\n"); + hwpe_trigger_job(); + + // Wait for HWPE completion using Event Unit + if (USE_WFE) { + eu_redmule_wait_completion(EU_WAIT_MODE_WFE); + printf("Detected WFE...\n"); + } else { + eu_redmule_wait_completion(EU_WAIT_MODE_POLLING); + printf("Detected polling completion...\n"); + } + printf("Verifying results...\n"); + + // Disable RedMulE + hwpe_cg_disable(); + + unsigned int num_errors = 0; + + uint16_t computed, expected, diff; + for(int i = 0; i < M_SIZE*K_SIZE; i++){ + computed = mmio16(Y_BASE + 2*i); + expected = mmio16(Z_BASE + 2*i); + diff = (computed > expected) ? (computed - expected) : (expected - computed); + if(diff > DIFF_TH){ + num_errors++; + printf("**ERROR**: Y[%8x](=0x%4x) != Z[%8x](=0x%4x)\n", Y_BASE + 2*i, computed, Z_BASE + 2*i, expected); + } + } + printf("Finished test with %0d errors\n", num_errors); + + + return num_errors; +} \ No newline at end of file diff --git a/sw/tests/eu_tests/tile_test_event_unit.c b/sw/tests/eu_tests/tile_test_event_unit.c new file mode 100644 index 0000000..f71a4e7 --- /dev/null +++ b/sw/tests/eu_tests/tile_test_event_unit.c @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on tile_test.c by Victor Isachi + * + * MAGIA Tile Test - Event Unit WFE API Version + * Uses event_unit_utils.h for Event Unit control and WFE/polling + * + */ + +#include "magia_tile_utils.h" +#include "redmule_mm_utils.h" +#include "idma_mm_utils.h" +#include "event_unit_utils.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define W_BASE (L1_BASE + 0x00016048) +#define Y_BASE (L1_BASE + 0x0001A048) +#define Z_BASE (L2_BASE + 0x00042000) +#define V_BASE (L2_BASE + 0x00046000) +#define T_BASE (L2_BASE + 0x0004A000) + +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +#define VERBOSE (0) + +#define USE_WFE (0) + +#define WAIT_CYCLES (10) + +#define DIFF_TH (0x0011) + +#define CONCURRENT + +void idma_mv_in(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uint32_t dst_address){ + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + // Initialize Event Unit once + static int eu_initialized = 0; + if (!eu_initialized) { + eu_init(); + eu_initialized = 1; + } + + for (int i = 0; i < x_dim*y_dim; i++) + mmio16(T_BASE + 2*i) = src_data[i]; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)T_BASE; + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x\n", dst_addr); + printf("src_addr: 0x%8x\n", src_addr); + printf("len: %0d\n", len); +#endif + + uint32_t transfer_id = idma_L2ToL1(src_addr, dst_addr, 
len); + + // Clear Event Unit and ensure A2O mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_A2O_DONE_MASK); + + if (USE_WFE) { + eu_idma_wait_a2o_completion(EU_WAIT_MODE_WFE); + } else { + eu_idma_wait_a2o_completion(EU_WAIT_MODE_POLLING); + } + +#if VERBOSE > 100 + for (int i = 0; i < x_dim*y_dim; i++) + printf("DST[%8x]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i)); +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_address + 2*i) != src_data[i]) { + num_errors++; + printf("DST[%8x]: 0x%4x != SRC[%0d]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i), i, src_data[i]); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} + +void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, uint32_t dst_address){ + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)src_address; + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x\n", dst_addr); + printf("src_addr: 0x%8x\n", src_addr); + printf("len: %0d\n", len); +#endif + + uint32_t transfer_id = idma_L1ToL2(src_addr, dst_addr, len); + + // Clear Event Unit and ensure O2A mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_O2A_DONE_MASK); + + if (USE_WFE) { + eu_idma_wait_o2a_completion(EU_WAIT_MODE_WFE); + } else { + eu_idma_wait_o2a_completion(EU_WAIT_MODE_POLLING); + } + +#if VERBOSE > 100 + for (int i = 0; i < x_dim*y_dim; i++) + printf("DST[%8x]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i)); +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_address + 2*i) != mmio16(src_address + 2*i)) { + num_errors++; + printf("DST[%8x]: 0x%4x != SRC[%8x]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i), src_address + 2*i, 
mmio16(src_address + 2*i)); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} + +int main(void) { + // X + printf("Initializing X through iDMA...\n"); + idma_mv_in(M_SIZE, N_SIZE, x_inp, X_BASE); + + // W + printf("Initializing W through iDMA...\n"); + idma_mv_in(N_SIZE, K_SIZE, w_inp, W_BASE); + + // Y + printf("Initializing Y through iDMA...\n"); + idma_mv_in(M_SIZE, K_SIZE, y_inp, Y_BASE); + + // Z - golden (reference) + printf("Initializing Z - golden...\n"); + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Z_BASE + 2*i) = z_oup[i]; +#if VERBOSE > 100 + for (int i = 0; i < M_SIZE*K_SIZE; i++) + printf("Z[%8x]: 0x%4x\n", Z_BASE + 2*i, mmio16(Z_BASE + 2*i)); +#endif + +#if VERBOSE > 10 + printf("K_SIZE: %4x\n", K_SIZE); + printf("M_SIZE: %4x\n", M_SIZE); + printf("N_SIZE: %4x\n", N_SIZE); +#endif + + // Initialize and configure RedMulE using MM approach + hwpe_cg_enable(); + hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) + ; + + redmule_cfg((unsigned int)X_BASE, (unsigned int)W_BASE, (unsigned int)Y_BASE, + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops, (uint8_t)Float16); + + // Initialize Event Unit for RedMulE + eu_redmule_init(); + + printf("Testing matrix multiplication with RedMulE...\n"); + hwpe_trigger_job(); + + // Wait for HWPE completion using Event Unit + if (USE_WFE) { + eu_redmule_wait_completion(EU_WAIT_MODE_WFE); + } else { + eu_redmule_wait_completion(EU_WAIT_MODE_POLLING); + } + + printf("Moving results through iDMA...\n"); + idma_mv_out(M_SIZE, K_SIZE, Y_BASE, V_BASE); + + printf("Verifying results...\n"); + + unsigned int num_errors = 0; + + uint16_t computed, expected, diff; + for(int i = 0; i < M_SIZE*K_SIZE; i++){ + computed = mmio16(V_BASE + 2*i); + expected = mmio16(Z_BASE + 2*i); + diff = (computed > expected) ? 
(computed - expected) : (expected - computed); + if(diff > DIFF_TH){ + num_errors++; + printf("**ERROR**: V[%8x](=0x%4x) != Z[%8x](=0x%4x)\n", V_BASE + 2*i, computed, Z_BASE + 2*i, expected); + } + } + printf("Finished test with %0d errors\n", num_errors); + + + return num_errors; +} \ No newline at end of file diff --git a/sw/tests/fpu_test.c b/sw/tests/fpu_test.c index e37042d..83379b7 100644 --- a/sw/tests/fpu_test.c +++ b/sw/tests/fpu_test.c @@ -39,30 +39,26 @@ inline uint32_t f_add(volatile uint32_t op_a, volatile uint32_t op_b){ } int main(void) { - // uint32_t exit_code; + uint32_t exit_code; - // volatile float a, b, c; - // a = A_VAL; - // b = B_VAL; - // c = a+b; - - // if (abs_diff(c, C_EXP) > FP_TH){ - // exit_code = FAIL_EXIT_CODE; - // printf("Test FAILED\n"); - // }else{ - // exit_code = PASS_EXIT_CODE; - // printf("Test PASSED\n"); - // } - - // mmio16(TEST_END_ADDR) = exit_code; +#ifndef CV32E40X + volatile float a, b, c; + a = A_VAL; + b = B_VAL; + c = a+b; + if (abs_diff(c, C_EXP) > FP_TH){ + printf("Test FAILED\n"); + }else{ + printf("Test PASSED\n"); + } +#else uint32_t a, b, c; a = 0x414570A4; // Binary for 12.34f b = 0x42631EB8; // Binary for 56.78f c = f_add(a, b); printf("Float operation result: 0x%0x [expected: 0x428A3D71(69.12f)]\n", c); - - mmio16(TEST_END_ADDR) = DEFAULT_EXIT_CODE; +#endif return 0; } diff --git a/sw/tests/hello_mesh.c b/sw/tests/hello_mesh.c index f1ff565..48e5c5d 100644 --- a/sw/tests/hello_mesh.c +++ b/sw/tests/hello_mesh.c @@ -24,7 +24,7 @@ int main(void) { // h_pprintf("Hello World! it is hartid "); pprintf(ds(get_hartid())); pprintln; - printf("Hello World! it is hartid %0d\n", get_hartid()); + printf("Hello World! 
it is tile/hart %0d\n", get_hartid()); return 0; } diff --git a/sw/tests/inter_l1_test.c b/sw/tests/inter_l1_test.c index 41eec0c..ac3f60f 100644 --- a/sw/tests/inter_l1_test.c +++ b/sw/tests/inter_l1_test.c @@ -52,8 +52,8 @@ int main() { if (get_hartid() == 0) { for (int i = 0; i < NUM_HARTS; i++) if (error[i]) total_errors++; - if (total_errors) { /*h_pprintf("TEST FAILED!!"); pprintln;*/ printf("TEST FAILED!!"); } - else { /*h_pprintf("TEST PASSED!!"); pprintln;*/ printf("TEST PASSED!!"); } + if (total_errors) { /*h_pprintf("TEST FAILED!!"); pprintln;*/ printf("TEST FAILED!!\n"); } + else { /*h_pprintf("TEST PASSED!!"); pprintln;*/ printf("TEST PASSED!!\n"); } } else wait_nop(SETTLE_CYCLE); return total_errors; diff --git a/sw/tests/mm_tests/fsync_extended_test_mm.c b/sw/tests/mm_tests/fsync_extended_test_mm.c new file mode 100644 index 0000000..ded77fc --- /dev/null +++ b/sw/tests/mm_tests/fsync_extended_test_mm.c @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on fsync_extended_test.c by Victor Isachi + * + * MAGIA FractalSync Memory-Mapped Synchronization Test + * WARNING: //STALLING MODE = POLLING, //NONSTALLING = WFI currently not working for race conditions + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "fsync_mm_utils.h" +#include "fsync_mm_api.h" +#include "cache_fill.h" + +#define VERBOSE (0) + + +int main(void) { + uint32_t aggregates[NUM_HARTS]; + uint32_t ids[NUM_HARTS]; + + +#if NUM_HARTS == 16 + /// Custom 4x4 synch. + switch (get_hartid()){ + case 0: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 1: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 2: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 3: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 4: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 5: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 6: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 7: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 8: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 9: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 10: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 11: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 12: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 13: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 14: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + case 15: aggregates[get_hartid()] = 0b1111; ids[get_hartid()] = 7; break; + } + + // h_pprintf("FractalSync aggregate: 0b"); pprintf(bs(aggregates[get_hartid()])); pprintf(", id: "); pprintf(ds(ids[get_hartid()])); n_pprintf("..."); + printf("FractalSync 
aggregate: 0x%0x, id: %0d...\n", aggregates[get_hartid()], ids[get_hartid()]); + + fsync_mm(ids[get_hartid()], aggregates[get_hartid()]); + + + sentinel_instr_id(); +#endif + + printf("[FractalSync MM] Horizontal neighbor test starting\n"); + fsync_mm_hnbr(); + + + sentinel_instr_id(); + printf("[FractalSync MM] Horizontal neighbor test ending\n"); + + printf("[FractalSync MM] Horizontal ring neighbor test starting\n"); + fsync_mm_hring(); + + + sentinel_instr_id(); + printf("[FractalSync MM] Horizontal ring neighbor test ending\n"); + + printf("[FractalSync MM] Vertical neighbor test starting\n"); + fsync_mm_vnbr(); + + + sentinel_instr_id(); + printf("[FractalSync MM] Vertical neighbor test ending\n"); + + printf("[FractalSync MM] Vertical ring neighbor test starting\n"); + fsync_mm_vring(); + + + sentinel_instr_id(); + printf("[FractalSync MM] Vertical ring neighbor test ending\n"); + + printf("[FractalSync MM] Row test starting\n"); + fsync_mm_rows(); + + + sentinel_instr_id(); + printf("[FractalSync MM] Row test ending\n"); + + printf("[FractalSync MM] Column test starting\n"); + fsync_mm_cols(); + + sentinel_instr_id(); + printf("[FractalSync MM] Column test ending\n"); + + printf("[FractalSync MM] Global test starting\n"); + fsync_mm_global(); + + + sentinel_instr_id(); + printf("[FractalSync MM] Global test ending\n"); + + // h_pprintf("FractalSync test finished...\n"); + printf("FractalSync MM test finished...\n"); + + return 0; +} \ No newline at end of file diff --git a/sw/tests/mm_tests/fsync_test_mm.c b/sw/tests/mm_tests/fsync_test_mm.c new file mode 100644 index 0000000..aae62a7 --- /dev/null +++ b/sw/tests/mm_tests/fsync_test_mm.c @@ -0,0 +1,271 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on fsync_test.c by Victor Isachi + * + * MAGIA FractalSync Memory-Mapped Synchronization Test + * WARNING: //STALLING = POLLING, //NONSTALLING = WFI not working in this new version of the tile with event unit + * WARNING: Make sure to undefine EVENT_UNIT in fsync_mm_utils.h, otherwise test will proceed without correctly waiting for sync completion + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "fsync_mm_utils.h" +#include "fsync_mm_api.h" +#include "cache_fill.h" + +#define VERBOSE (0) + +#define CLIB_FS_MM_TEST +// #define GLOBAL_FS_MM_TEST +// #define ROW_FS_MM_TEST +// #define COL_FS_MM_TEST +// #define HNBR_FS_MM_TEST +// #define VNBR_FS_MM_TEST +// #define HRING_FS_MM_TEST +// #define VRING_FS_MM_TEST + +#define NUM_LEVELS (31-__builtin_clz(NUM_HARTS)) + + +#define CACHE_HEAT_CYCLES (3) + +int main(void) { + uint32_t tile_hartid = get_hartid(); + uint32_t tile_xhartid = GET_X_ID(tile_hartid); + uint32_t tile_yhartid = GET_Y_ID(tile_hartid); + + printf("Starting Fractal Sync Memory-Mapped test...\n"); + + // Filling up the cache + fill_icache(); + + // Execute synchronization multiple times to pre-heat the cache + for (int i = 0; i < CACHE_HEAT_CYCLES; i++) { +#ifdef CLIB_FS_MM_TEST + // Climb FS tree test using memory-mapped interface + for (int i = 0; i < NUM_LEVELS; i++){ + printf("Fractal Sync at level %0d...\n", i+1); + + + uint32_t aggregates = (1 << (i+1))-1; + uint32_t ids = 0; +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", 
aggregates); + printf("id: 0x%0x\n", ids); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + fsync_mm(ids, aggregates); + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + + printf("Synchronized...\n"); + } +#endif + +#ifdef GLOBAL_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync global synchronization test...\n"); +#endif + + +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", _FS_MM_GLOBAL_AGGR); + printf("id: 0x%0x\n", _FS_MM_GLOBAL_ID); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + fsync_mm_global(); + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef HNBR_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync horizontal neighbor synchronization test...\n"); +#endif + + +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", _FS_MM_HNBR_AGGR); + printf("id: 0x%0x\n", _FS_MM_HNBR_ID); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + fsync_mm_hnbr(); + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef VNBR_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync vertical neighbor synchronization test...\n"); +#endif + + +#if VERBOSE > 10 + printf("aggregate: 0x%0x\n", _FS_MM_VNBR_AGGR); + printf("id: 0x%0x\n", _FS_MM_VNBR_ID); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + fsync_mm_vnbr(); + + // Instruction immediately following synchronization: indicates end of the synchronization 
region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef HRING_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync horizontal ring synchronization test...\n"); +#endif + + +#if VERBOSE > 10 + if ((tile_xhartid == 0) || (tile_xhartid == MESH_X_TILES-1)){ + uint32_t id = row_id_lookup_mm(tile_yhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_LVL); + printf("id: 0x%0x\n", id); + } else { + printf("aggregate: 0x%0x\n", _FS_MM_HRING_AGGR); + printf("id: 0x%0x\n", _FS_MM_HRING_ID); + } +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + fsync_mm_hring(); + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef VRING_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync vertical ring synchronization test...\n"); +#endif + + +#if VERBOSE > 10 + if ((tile_yhartid == 0) || (tile_yhartid == MESH_Y_TILES-1)){ + uint32_t id = col_id_lookup_mm(tile_xhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_LVL); + printf("id: 0x%0x\n", id); + } else { + printf("aggregate: 0x%0x\n", _FS_MM_VRING_AGGR); + printf("id: 0x%0x\n", _FS_MM_VRING_ID); + } +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + fsync_mm_vring(); + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef ROW_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync row synchronization test...\n"); +#endif + + +#if VERBOSE > 10 + uint32_t id = row_id_lookup_mm(tile_yhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_AGGR); + printf("id: 0x%0x\n", id); +#endif + + // Instruction immediately preceding synchronization: indicates start of the 
synchronization region + sentinel_start(); + + fsync_mm_rows(); + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + +#ifdef COL_FS_MM_TEST +#if VERBOSE > 1 + printf("Fractal Sync column synchronization test...\n"); +#endif + + +#if VERBOSE > 10 + uint32_t id = col_id_lookup_mm(tile_xhartid); + printf("aggregate: 0x%0x\n", _FS_MM_RC_AGGR); + printf("id: 0x%0x\n", id); +#endif + + // Instruction immediately preceding synchronization: indicates start of the synchronization region + sentinel_start(); + + fsync_mm_cols(); + + // Instruction immediately following synchronization: indicates end of the synchronization region + sentinel_end(); + +#if VERBOSE > 1 + printf("Synchronized...\n"); +#endif +#endif + } + + printf("Fractal Sync Memory-Mapped test finished...\n"); + + + return 0; +} \ No newline at end of file diff --git a/sw/tests/mm_tests/idma_test_mm.c b/sw/tests/mm_tests/idma_test_mm.c new file mode 100644 index 0000000..07554c9 --- /dev/null +++ b/sw/tests/mm_tests/idma_test_mm.c @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on idma_test.c by Victor Isachi + * + * MAGIA iDMA Test using Memory-Mapped Control + */ + +#include "magia_tile_utils.h" +#include "idma_mm_utils.h" + +#include "x_input.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define Y_BASE (L1_BASE + 0x00016048) +#define Z_BASE (L2_BASE + 0x00001000) +#define W_BASE (L2_BASE + 0x00005000) + +#define M_SIZE (96) +#define N_SIZE (64) + +#define VERBOSE (0) + +#define WAIT_CYCLES (10) + +#define CONCURRENT + +int main(void) { + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + uint32_t dst_std_2; + uint32_t src_std_2; + uint32_t reps_2; + + uint32_t dst_std_3; + uint32_t src_std_3; + uint32_t reps_3; + + // Z - golden (reference) + for (int i = 0; i < M_SIZE*N_SIZE; i++) + mmio16(Z_BASE + 2*i) = x_inp[i]; +#if VERBOSE > 100 + for (int i = 0; i < M_SIZE*N_SIZE; i++) + printf("Z[%8x]: 0x%4x\n", Z_BASE + 2*i, mmio16(Z_BASE + 2*i)); +#endif + + dst_addr = (uint32_t)X_BASE; + src_addr = (uint32_t)Z_BASE; + len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x (X_BASE)\n", dst_addr); + printf("src_addr: 0x%8x (Z_BASE)\n", src_addr); + printf("len: %0d\n", len); +#endif + + dst_std_2 = 0; + src_std_2 = 0; + reps_2 = 1; +#if VERBOSE > 10 + printf("dst_std_2: 0x%8x\n", dst_std_2); + printf("src_std_2: 0x%8x\n", src_std_2); + printf("reps_2: 0x%8x\n", reps_2); +#endif + + dst_std_3 = 0; + src_std_3 = 0; + reps_3 = 1; +#if VERBOSE > 10 + printf("dst_std_3: 0x%8x\n", dst_std_3); + printf("src_std_3: 0x%8x\n", src_std_3); + printf("reps_3: 0x%8x\n", reps_3); +#endif + + uint32_t transfer_id_1 = idma_L2ToL1(src_addr, dst_addr, len); + printf("iDMA moving data from L2 to L1...\n"); + + // Use polling to wait for completion + dma_wait(transfer_id_1); + + dst_addr = (uint32_t)W_BASE; + src_addr = (uint32_t)X_BASE; + len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element +#if VERBOSE > 10 + 
printf("dst_addr: 0x%8x (W_BASE)\n", dst_addr); + printf("src_addr: 0x%8x (X_BASE)\n", src_addr); + printf("len: %0d\n", len); +#endif + + dst_std_2 = 0; + src_std_2 = 0; + reps_2 = 1; +#if VERBOSE > 10 + printf("dst_std_2: 0x%8x\n", dst_std_2); + printf("src_std_2: 0x%8x\n", src_std_2); + printf("reps_2: 0x%8x\n", reps_2); +#endif + + dst_std_3 = 0; + src_std_3 = 0; + reps_3 = 1; +#if VERBOSE > 10 + printf("dst_std_3: 0x%8x\n", dst_std_3); + printf("src_std_3: 0x%8x\n", src_std_3); + printf("reps_3: 0x%8x\n", reps_3); +#endif + + uint32_t transfer_id_2 = idma_L1ToL2(src_addr, dst_addr, len); + + printf("iDMA moving data from L1 to L2...\n"); + + // Use polling to wait for completion + dma_wait(transfer_id_2); + +#ifdef CONCURRENT + // Setup concurrent transfer L2->L1 to Y_BASE + dst_addr = (uint32_t)Y_BASE; + src_addr = (uint32_t)Z_BASE; + len = (uint32_t)(M_SIZE*N_SIZE*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x (Y_BASE)\n", dst_addr); + printf("src_addr: 0x%8x (Z_BASE)\n", src_addr); + printf("len: %0d\n", len); +#endif + + // Start both transfers concurrently + uint32_t transfer_id_o2a = transfer_id_2; // OBI2AXI (L1->L2) already started + uint32_t transfer_id_a2o = idma_L2ToL1(src_addr, dst_addr, len); // Start AXI2OBI (L2->L1) + + printf("iDMA moving concurrently data from L1 to L2 and from L2 to L1...\n"); + + // Use polling to wait for both transfers completion + dma_wait(transfer_id_o2a); + dma_wait(transfer_id_a2o); +#else + // Single transfer mode + + // Use polling to wait for completion + dma_wait(transfer_id_2); +#endif + + printf("Verifying results...\n"); + + unsigned int num_errors = 0; + + uint16_t detected_l1, detected_l2, expected; + for(int i = 0; i < M_SIZE*N_SIZE; i++){ + detected_l2 = mmio16(W_BASE + 2*i); +#ifdef CONCURRENT + detected_l1 = mmio16(Y_BASE + 2*i); +#else + detected_l1 = mmio16(X_BASE + 2*i); +#endif + expected = mmio16(Z_BASE + 2*i); + if((detected_l2 != expected) || (detected_l1 != expected)){ + 
num_errors++; + printf("**ERROR**: DETECTED L2[%0d](=0x%4x) || DETECTED L1[%0d](=0x%4x) != EXPECTED[%0d](=0x%4x)\n", i, detected_l2, i, detected_l1, i, expected); + } + } + printf("Finished test with %0d errors\n", num_errors); + + return num_errors; +} diff --git a/sw/tests/mm_tests/mesh_test_mm.c b/sw/tests/mm_tests/mesh_test_mm.c new file mode 100644 index 0000000..919bf95 --- /dev/null +++ b/sw/tests/mm_tests/mesh_test_mm.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on mesh_test.c by Victor Isachi + * + * MAGIA Mesh Test - Memory Mapped Version + */ + +#include "magia_tile_utils.h" +#include "magia_utils.h" +#include "redmule_mm_utils.h" +#include "idma_mm_utils.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define W_BASE (L1_BASE + 0x00016048) +#define Y_BASE (L1_BASE + 0x0001A048) +#define Z_BASE (L2_BASE + 0x00042000) // Note: for a large number of tiles (e.g. 64x64 mesh) we might exceed memory range of L2 +#define V_BASE (L2_BASE + 0x00046000) // Note: for a large number of tiles (e.g. 64x64 mesh) we might exceed memory range of L2 +#define T_BASE (L2_BASE + 0x0004A000) // Note: for a large number of tiles (e.g.
64x64 mesh) we might exceed memory range of L2 + +#define MHARTID_OFFSET (0x00010000) // Per-hart stride applied to the L2 regions at V_BASE and T_BASE + +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +#define VERBOSE (0) + +#define WAIT_CYCLES (10) + +#define DIFF_TH (0x0011) // Largest accepted distance between the raw 16-bit result and the golden value + +#define CONCURRENT + +void idma_mv_in(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uint32_t dst_address){ // Stage src_data in this hart's L2 scratch area (T_BASE), then copy it to dst_address with idma_L2ToL1 + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + for (int i = 0; i < x_dim*y_dim; i++) + mmio16(T_BASE + get_hartid()*MHARTID_OFFSET + 2*i) = src_data[i]; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)(T_BASE + get_hartid()*MHARTID_OFFSET); + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%0x\n", dst_addr); + printf("src_addr: 0x%0x\n", src_addr); + printf("len: %0d\n", len); +#endif + + uint32_t transfer_id = idma_L2ToL1(src_addr, dst_addr, len); + + dma_wait(transfer_id); + +#if VERBOSE > 100 + for (int i = 0; i < x_dim*y_dim; i++){ + printf("DST[0x%0x]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i)); + } +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_addr + 2*i) != src_data[i]) { + num_errors++; + printf("DST[0x%0x]: 0x%0x != SRC[%0d]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i), i, src_data[i]); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} + +void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, uint32_t dst_address){ // Copy x_dim*y_dim 16-bit elements from src_address to dst_address with idma_L1ToL2, then wait for the transfer + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)src_address; + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%0x\n", dst_addr); + printf("src_addr: 0x%0x\n", src_addr); + printf("len: %0d\n", len); +#endif + + uint32_t transfer_id = idma_L1ToL2(src_addr, dst_addr, len); + + dma_wait(transfer_id); + +#if VERBOSE > 100 + for (int i = 0; i <
x_dim*y_dim; i++){ + printf("DST[0x%0x]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i)); + } +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_addr + 2*i) != mmio16(src_addr + 2*i)) { + num_errors++; + printf("DST[0x%0x]: 0x%0x != SRC[%0d]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i), i, mmio16(src_addr + 2*i)); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} + +int main(void) { // Per hart: load X, W and Y through iDMA at its L1_TILE_OFFSET slot, run one RedMulE GEMM, move Y to L2 (V_BASE) and compare it with z_oup within DIFF_TH; returns the error count + // X + printf("Initializing X through iDMA...\n"); + idma_mv_in(M_SIZE, N_SIZE, x_inp, (X_BASE + get_hartid()*L1_TILE_OFFSET)); + + // W + printf("Initializing W through iDMA...\n"); + idma_mv_in(N_SIZE, K_SIZE, w_inp, (W_BASE + get_hartid()*L1_TILE_OFFSET)); + + // Y + printf("Initializing Y through iDMA...\n"); + idma_mv_in(M_SIZE, K_SIZE, y_inp, (Y_BASE + get_hartid()*L1_TILE_OFFSET)); + +#if VERBOSE > 10 + printf("K_SIZE: 0x%0x\n", K_SIZE); + printf("M_SIZE: 0x%0x\n", M_SIZE); + printf("N_SIZE: 0x%0x\n", N_SIZE); +#endif + + printf("Testing matrix multiplication with RedMulE...\n"); + + // Initialize and configure RedMulE using MM approach + hwpe_cg_enable(); + hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) // Retry until a non-negative value is returned + ; + + redmule_cfg((unsigned int)(X_BASE + get_hartid()*L1_TILE_OFFSET), + (unsigned int)(W_BASE + get_hartid()*L1_TILE_OFFSET), + (unsigned int)(Y_BASE + get_hartid()*L1_TILE_OFFSET), + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops, (uint8_t)Float16); + + hwpe_trigger_job(); + + // Wait for HWPE completion + hwpe_wait_for_completion(); + + printf("Moving results through iDMA...\n"); + idma_mv_out(M_SIZE, K_SIZE, Y_BASE + get_hartid()*L1_TILE_OFFSET, V_BASE + get_hartid()*MHARTID_OFFSET); + + printf("Verifying results...\n"); + + unsigned int num_errors[NUM_HARTS]; // NOTE(review): only slot [get_hartid()] is touched by this hart; confirm the per-hart arrays are really needed + num_errors[get_hartid()] = 0; + + uint16_t computed[NUM_HARTS], expected[NUM_HARTS], diff[NUM_HARTS]; + for(int i = 0; i < M_SIZE*K_SIZE; i++){ +
computed[get_hartid()] = mmio16(V_BASE + get_hartid()*MHARTID_OFFSET + 2*i); + expected[get_hartid()] = z_oup[i]; + diff[get_hartid()] = (computed[get_hartid()] > expected[get_hartid()]) ? (computed[get_hartid()] - expected[get_hartid()]) : (expected[get_hartid()] - computed[get_hartid()]); + if(diff[get_hartid()] > DIFF_TH){ + num_errors[get_hartid()]++; + printf("**ERROR**: V[0x%0x](=0x%0x) != Z[%0d](=0x%0x)\n", V_BASE + get_hartid()*MHARTID_OFFSET + 2*i, computed[get_hartid()], i, expected[get_hartid()]); + } + } + printf("Finished test with %0d error(s)\n", num_errors[get_hartid()]); + + return num_errors[get_hartid()]; +} diff --git a/sw/tests/mm_tests/redmule_test_mm.c b/sw/tests/mm_tests/redmule_test_mm.c new file mode 100644 index 0000000..eea3abf --- /dev/null +++ b/sw/tests/mm_tests/redmule_test_mm.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on redmule_test.c by Victor Isachi + * + * RedMulE Matrix Multiplication Test with MMIO HWPE Control Functions + * + * This test uses MMIO functions for HWPE control and simplified IRQ management + * copied from redmule_test.c for better reliability.
+ */ + +#include <stdint.h> +#include "magia_tile_utils.h" +#include "redmule_mm_utils.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define W_BASE (L1_BASE + 0x00016048) +#define Y_BASE (L1_BASE + 0x0001A048) +#define Z_BASE (L2_BASE + 0x00042000) + +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +#define VERBOSE (0) + +#define WAIT_CYCLES (10) + +#define DIFF_TH (0x0011) // Largest accepted distance between the raw 16-bit result and the golden value + +int main(void) { // Store X, W, Y (L1) and the golden Z (L2) with MMIO writes, run one RedMulE GEMM job, then compare Y against Z within DIFF_TH; returns the error count + // X + for (int i = 0; i < M_SIZE*N_SIZE; i++) + mmio16(X_BASE + 2*i) = x_inp[i]; +#if VERBOSE > 10 + for (int i = 0; i < M_SIZE*N_SIZE; i++) + printf("X[%8x]: 0x%4x\n", X_BASE + 2*i, mmio16(X_BASE + 2*i)); +#endif + + // W + for (int i = 0; i < N_SIZE*K_SIZE; i++) + mmio16(W_BASE + 2*i) = w_inp[i]; +#if VERBOSE > 10 + for (int i = 0; i < N_SIZE*K_SIZE; i++) + printf("W[%8x]: 0x%4x\n", W_BASE + 2*i, mmio16(W_BASE + 2*i)); +#endif + +// Y + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Y_BASE + 2*i) = y_inp[i]; +#if VERBOSE > 10 + for (int i = 0; i < M_SIZE*K_SIZE; i++) + printf("Y[%8x]: 0x%4x\n", Y_BASE + 2*i, mmio16(Y_BASE + 2*i)); +#endif + + // Z - golden (reference) + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Z_BASE + 2*i) = z_oup[i]; +#if VERBOSE > 10 + for (int i = 0; i < M_SIZE*K_SIZE; i++) + printf("Z[%8x]: 0x%4x\n", Z_BASE + 2*i, mmio16(Z_BASE + 2*i)); +#endif + +#if VERBOSE > 10 + printf("K_SIZE: %4x\n", K_SIZE); + printf("M_SIZE: %4x\n", M_SIZE); + printf("N_SIZE: %4x\n", N_SIZE); +#endif + + // Initialize and configure RedMulE + hwpe_cg_enable(); + hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) // Retry until a non-negative value is returned + ; + + redmule_cfg((unsigned int)X_BASE, (unsigned int)W_BASE, (unsigned int)Y_BASE, + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops, (uint8_t)Float16); + + // Start the computation + printf("Testing matrix multiplication with RedMulE...\n"); + hwpe_trigger_job(); + + // Wait for HWPE completion +
hwpe_wait_for_completion(); + + printf("Verifying results...\n"); + + // Disable RedMulE + hwpe_cg_disable(); + + unsigned int num_errors = 0; + + uint16_t computed, expected, diff; + for(int i = 0; i < M_SIZE*K_SIZE; i++){ + computed = mmio16(Y_BASE + 2*i); + expected = mmio16(Z_BASE + 2*i); + diff = (computed > expected) ? (computed - expected) : (expected - computed); // Distance between the raw 16-bit encodings, not a floating-point difference + if(diff > DIFF_TH){ + num_errors++; + printf("**ERROR**: Y[%8x](=0x%4x) != Z[%8x](=0x%4x)\n", Y_BASE + 2*i, computed, Z_BASE + 2*i, expected); + } + } + printf("Finished test with %0d errors\n", num_errors); + + return num_errors; +} diff --git a/sw/tests/mm_tests/tile_test_mm.c b/sw/tests/mm_tests/tile_test_mm.c new file mode 100644 index 0000000..a79e92d --- /dev/null +++ b/sw/tests/mm_tests/tile_test_mm.c @@ -0,0 +1,192 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on tile_test.c by Victor Isachi + * + * MAGIA Tile Test - Memory Mapped Version + */ + +#include "magia_tile_utils.h" +#include "redmule_mm_utils.h" +#include "idma_mm_utils.h" + +#include "x_input.h" +#include "w_input.h" +#include "y_input.h" +#include "z_output.h" + +#define X_BASE (L1_BASE + 0x00012048) +#define W_BASE (L1_BASE + 0x00016048) +#define Y_BASE (L1_BASE + 0x0001A048) +#define Z_BASE (L2_BASE + 0x00042000) +#define V_BASE (L2_BASE + 0x00046000) +#define T_BASE (L2_BASE + 0x0004A000) + +#define M_SIZE (96) +#define N_SIZE (64) +#define K_SIZE (64) + +#define VERBOSE (0) + +#define WAIT_CYCLES (10) + +#define DIFF_TH (0x0011) // Largest accepted distance between the raw 16-bit result and the golden value + +#define CONCURRENT + + +void idma_mv_in(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uint32_t dst_address){ // Stage src_data in the L2 scratch area at T_BASE, then copy it to dst_address with idma_L2ToL1 + uint32_t dst_addr; + uint32_t src_addr; + uint32_t len; + + for (int i = 0; i < x_dim*y_dim; i++) + mmio16(T_BASE + 2*i) = src_data[i]; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)T_BASE; + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x\n", dst_addr); + printf("src_addr: 0x%8x\n", src_addr); + printf("len: %0d\n", len); +#endif + + uint32_t transfer_id = idma_L2ToL1(src_addr, dst_addr, len); + + dma_wait(transfer_id); + +#if VERBOSE > 100 + for (int i = 0; i < x_dim*y_dim; i++) + printf("DST[%8x]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i)); +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_address + 2*i) != src_data[i]) { + num_errors++; + printf("DST[%8x]: 0x%4x != SRC[%0d]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i), i, src_data[i]); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} + +void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, uint32_t dst_address){ // Copy x_dim*y_dim 16-bit elements from src_address to dst_address with idma_L1ToL2, then wait for the transfer + uint32_t dst_addr; +
uint32_t src_addr; + uint32_t len; + + dst_addr = (uint32_t)dst_address; + src_addr = (uint32_t)src_address; + len = (uint32_t)(x_dim*y_dim*2); // 2 Bytes per element +#if VERBOSE > 10 + printf("dst_addr: 0x%8x\n", dst_addr); + printf("src_addr: 0x%8x\n", src_addr); + printf("len: %0d\n", len); +#endif + + uint32_t transfer_id = idma_L1ToL2(src_addr, dst_addr, len); + + dma_wait(transfer_id); + +#if VERBOSE > 100 + for (int i = 0; i < x_dim*y_dim; i++) + printf("DST[%8x]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i)); +#endif + +#if VERBOSE > 10 + unsigned int num_errors; + num_errors = 0; + for (int i = 0; i < x_dim*y_dim; i++) { + if (mmio16(dst_address + 2*i) != mmio16(src_address + 2*i)) { + num_errors++; + printf("DST[%8x]: 0x%4x != SRC[%8x]: 0x%4x\n", dst_address + 2*i, mmio16(dst_address + 2*i), src_address + 2*i, mmio16(src_address + 2*i)); + } + } + printf("Detected %0d error(s) in the transfer...\n", num_errors); +#endif +} + +int main(void) { // Load X, W and Y into L1 through iDMA, store the golden Z in L2, run one RedMulE GEMM, move Y to V_BASE and compare it with Z within DIFF_TH; returns the error count + // X + printf("Initializing X through iDMA...\n"); + idma_mv_in(M_SIZE, N_SIZE, x_inp, X_BASE); + + // W + printf("Initializing W through iDMA...\n"); + idma_mv_in(N_SIZE, K_SIZE, w_inp, W_BASE); + + // Y + printf("Initializing Y through iDMA...\n"); + idma_mv_in(M_SIZE, K_SIZE, y_inp, Y_BASE); + + // Z - golden (reference) + printf("Initializing Z - golden...\n"); + for (int i = 0; i < M_SIZE*K_SIZE; i++) + mmio16(Z_BASE + 2*i) = z_oup[i]; +#if VERBOSE > 100 + for (int i = 0; i < M_SIZE*K_SIZE; i++) + printf("Z[%8x]: 0x%4x\n", Z_BASE + 2*i, mmio16(Z_BASE + 2*i)); +#endif + +#if VERBOSE > 10 + printf("K_SIZE: %4x\n", K_SIZE); + printf("M_SIZE: %4x\n", M_SIZE); + printf("N_SIZE: %4x\n", N_SIZE); +#endif + + // Initialize and configure RedMulE using MM approach + hwpe_cg_enable(); + hwpe_soft_clear(); + + int offload_id_tmp; + while ((offload_id_tmp = hwpe_acquire_job()) < 0) // Retry until a non-negative value is returned + ; + + redmule_cfg((unsigned int)X_BASE, (unsigned int)W_BASE, (unsigned int)Y_BASE, + M_SIZE, N_SIZE, K_SIZE, (uint8_t)gemm_ops,
(uint8_t)Float16); + + printf("Testing matrix multiplication with RedMulE...\n"); + hwpe_trigger_job(); + + // Wait for HWPE completion + hwpe_wait_for_completion(); + + printf("Moving results through iDMA...\n"); + idma_mv_out(M_SIZE, K_SIZE, Y_BASE, V_BASE); + + printf("Verifying results...\n"); + + unsigned int num_errors = 0; + + uint16_t computed, expected, diff; + for(int i = 0; i < M_SIZE*K_SIZE; i++){ + computed = mmio16(V_BASE + 2*i); + expected = mmio16(Z_BASE + 2*i); + diff = (computed > expected) ? (computed - expected) : (expected - computed); // Distance between the raw 16-bit encodings, not a floating-point difference + if(diff > DIFF_TH){ + num_errors++; + printf("**ERROR**: V[%8x](=0x%4x) != Z[%8x](=0x%4x)\n", V_BASE + 2*i, computed, Z_BASE + 2*i, expected); + } + } + printf("Finished test with %0d errors\n", num_errors); + + return num_errors; +} diff --git a/sw/tests/fsync_extended_test.c b/sw/tests/xif_tests/fsync_extended_test.c similarity index 73% rename from sw/tests/fsync_extended_test.c rename to sw/tests/xif_tests/fsync_extended_test.c index 675ff06..4a1064f 100644 --- a/sw/tests/fsync_extended_test.c +++ b/sw/tests/xif_tests/fsync_extended_test.c @@ -23,19 +23,17 @@ #include "magia_utils.h" #include "fsync_isa_utils.h" #include "fsync_api.h" +#include "event_unit_utils.h" #include "cache_fill.h" #define VERBOSE (0) -#define STALLING - int main(void) { uint32_t aggregates[NUM_HARTS]; uint32_t ids[NUM_HARTS]; -#ifndef STALLING - irq_en(1< 10 printf("aggregate: 0x%0x\n", _FS_GLOBAL_AGGR); printf("id: 0x%0x\n", _FS_GLOBAL_ID); @@ -103,11 +96,10 @@ int main(void) { // Instruction immediately preceding synchronization: indicates start of the synchronization region sentinel_start(); + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); fsync_global(); -#ifndef STALLING - asm volatile("wfi" ::: "memory"); - printf("Detected IRQ...\n"); -#endif + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); // Instruction immediately following synchronization: indicates end of the
synchronization region sentinel_end(); @@ -122,10 +114,6 @@ int main(void) { printf("Fractal Sync horizontal neighbor synchrnonization test...\n"); #endif -#ifndef STALLING - irq_en(1< 10 printf("aggregate: 0x%0x\n", _FS_HNBR_AGGR); printf("id: 0x%0x\n", _FS_HNBR_ID); @@ -134,11 +122,10 @@ int main(void) { // Instruction immediately preceding synchronization: indicates start of the synchronization region sentinel_start(); + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); fsync_hnbr(); -#ifndef STALLING - asm volatile("wfi" ::: "memory"); - printf("Detected IRQ...\n"); -#endif + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); // Instruction immediately following synchronization: indicates end of the synchronization region sentinel_end(); @@ -153,10 +140,6 @@ int main(void) { printf("Fractal Sync vertical neighbor synchrnonization test...\n"); #endif -#ifndef STALLING - irq_en(1< 10 printf("aggregate: 0x%0x\n", _FS_VNBR_AGGR); printf("id: 0x%0x\n", _FS_VNBR_ID); @@ -165,11 +148,10 @@ int main(void) { // Instruction immediately preceding synchronization: indicates start of the synchronization region sentinel_start(); + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); fsync_vnbr(); -#ifndef STALLING - asm volatile("wfi" ::: "memory"); - printf("Detected IRQ...\n"); -#endif + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); // Instruction immediately following synchronization: indicates end of the synchronization region sentinel_end(); @@ -184,10 +166,6 @@ int main(void) { printf("Fractal Sync horizontal ring synchrnonization test...\n"); #endif -#ifndef STALLING - irq_en(1< 10 if ((tile_xhartid == 0) || (tile_xhartid == MESH_X_TILES-1)){ uint32_t id = row_id_lookup(tile_yhartid); @@ -202,11 +180,10 @@ int main(void) { // Instruction immediately preceding synchronization: indicates start of the synchronization region sentinel_start(); + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); fsync_hring(); -#ifndef 
STALLING - asm volatile("wfi" ::: "memory"); - printf("Detected IRQ...\n"); -#endif + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); // Instruction immediately following synchronization: indicates end of the synchronization region sentinel_end(); @@ -221,10 +198,6 @@ int main(void) { printf("Fractal Sync vertical ring synchrnonization test...\n"); #endif -#ifndef STALLING - irq_en(1< 10 if ((tile_yhartid == 0) || (tile_yhartid == MESH_Y_TILES-1)){ uint32_t id = col_id_lookup(tile_xhartid); @@ -239,11 +212,10 @@ int main(void) { // Instruction immediately preceding synchronization: indicates start of the synchronization region sentinel_start(); + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); fsync_vring(); -#ifndef STALLING - asm volatile("wfi" ::: "memory"); - printf("Detected IRQ...\n"); -#endif + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); // Instruction immediately following synchronization: indicates end of the synchronization region sentinel_end(); @@ -258,10 +230,6 @@ int main(void) { printf("Fractal Sync row synchrnonization test...\n"); #endif -#ifndef STALLING - irq_en(1< 10 uint32_t id = row_id_lookup(tile_yhartid); printf("aggregate: 0x%0x\n", _FS_RC_AGGR); @@ -271,11 +239,10 @@ int main(void) { // Instruction immediately preceding synchronization: indicates start of the synchronization region sentinel_start(); + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); fsync_rows(); -#ifndef STALLING - asm volatile("wfi" ::: "memory"); - printf("Detected IRQ...\n"); -#endif + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); // Instruction immediately following synchronization: indicates end of the synchronization region sentinel_end(); @@ -290,10 +257,6 @@ int main(void) { printf("Fractal Sync column synchrnonization test...\n"); #endif -#ifndef STALLING - irq_en(1< 10 uint32_t id = col_id_lookup(tile_xhartid); printf("aggregate: 0x%0x\n", _FS_RC_AGGR); @@ -303,11 +266,10 @@ int main(void) { // Instruction 
immediately preceding synchronization: indicates start of the synchronization region sentinel_start(); + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_DONE_MASK); fsync_cols(); -#ifndef STALLING - asm volatile("wfi" ::: "memory"); - printf("Detected IRQ...\n"); -#endif + eu_fsync_wait_completion(EU_WAIT_MODE_POLLING); // Instruction immediately following synchronization: indicates end of the synchronization region sentinel_end(); diff --git a/sw/tests/idma_test.c b/sw/tests/xif_tests/idma_test.c similarity index 91% rename from sw/tests/idma_test.c rename to sw/tests/xif_tests/idma_test.c index 0811fbb..8f85b7f 100644 --- a/sw/tests/idma_test.c +++ b/sw/tests/xif_tests/idma_test.c @@ -21,6 +21,7 @@ #include "magia_tile_utils.h" #include "idma_isa_utils.h" +#include "event_unit_utils.h" #include "x_input.h" @@ -34,8 +35,6 @@ #define VERBOSE (0) -#define IRQ_EN - #define WAIT_CYCLES (10) #define CONCURRENT @@ -53,6 +52,9 @@ int main(void) { uint32_t src_std_3; uint32_t reps_3; + // Initialize Event Unit once + eu_init(); + // Z - golden (reference) for (int i = 0; i < M_SIZE*N_SIZE; i++) mmio16(Z_BASE + 2*i) = x_inp[i]; @@ -61,12 +63,6 @@ int main(void) { printf("Z[%8x]: 0x%4x\n", Z_BASE + 2*i, mmio16(Z_BASE + 2*i)); #endif -#ifdef IRQ_EN - // Enable IRQs - uint32_t index = (1< 10 - // h_pprintf("dst_addr: 0x"); n_pprintf(hs(dst_addr)); - // h_pprintf("src_addr: 0x"); n_pprintf(hs(src_addr)); - // h_pprintf("len: "); n_pprintf(ds(len)); printf("dst_addr: 0x%0x\n", dst_addr); printf("src_addr: 0x%0x\n", src_addr); printf("len: %0d\n", len); @@ -91,9 +83,6 @@ void idma_mv_in(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uin src_std_2 = 0; reps_2 = 1; #if VERBOSE > 100 - // h_pprintf("dst_std_2: 0x"); n_pprintf(hs(dst_std_2)); - // h_pprintf("src_std_2: 0x"); n_pprintf(hs(src_std_2)); - // h_pprintf("reps_2: 0x"); n_pprintf(hs(reps_2)); printf("dst_std_2: 0x%0x\n", dst_std_2); printf("src_std_2: 0x%0x\n", src_std_2); printf("reps_2: 0x%0x\n", 
reps_2); @@ -104,9 +93,6 @@ void idma_mv_in(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uin src_std_3 = 0; reps_3 = 1; #if VERBOSE > 100 - // h_pprintf("dst_std_3: 0x"); n_pprintf(hs(dst_std_3)); - // h_pprintf("src_std_3: 0x"); n_pprintf(hs(src_std_3)); - // h_pprintf("reps_3: 0x"); n_pprintf(hs(reps_3)); printf("dst_std_3: 0x%0x\n", dst_std_3); printf("src_std_3: 0x%0x\n", src_std_3); printf("reps_3: 0x%0x\n", reps_3); @@ -115,17 +101,15 @@ void idma_mv_in(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uin idma_start_in(); -#ifdef IRQ_EN - asm volatile("wfi" ::: "memory"); - // h_pprintf("Detected IRQ...\n"); - printf("Detected IRQ...\n"); -#else - wait_print(WAIT_CYCLES); -#endif + // Clear Event Unit and ensure A2O mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_A2O_DONE_MASK); + + // Use direction-specific wait for L2->L1 (A2O) + eu_wait_events_polling(EU_IDMA_A2O_DONE_MASK, 10000000); #if VERBOSE > 100 for (int i = 0; i < x_dim*y_dim; i++){ - // h_pprintf("DST[0x"); pprintf(hs(dst_addr + 2*i)); pprintf("]: 0x"); n_pprintf(hs(mmio16(dst_addr + 2*i))); printf("DST[0x%0x]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i)); } #endif @@ -136,12 +120,9 @@ void idma_mv_in(unsigned int x_dim, unsigned int y_dim, uint16_t src_data[], uin for (int i = 0; i < x_dim*y_dim; i++) { if (mmio16(dst_addr + 2*i) != src_data[i]) { num_errors++; - // h_pprintf("DST[0x"); pprintf(hs(dst_addr + 2*i)); pprintf("]: 0x"); pprintf(hs(mmio16(dst_addr + 2*i))); - // pprintf(" != SRC["); pprintf(ds(i)); pprintf("]: 0x"); n_pprintf(ds(src_data[i])); printf("DST[0x%0x]: 0x%0x != SRC[%0d]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i), i, src_data[i]); } } - // h_pprintf("Detected "); pprintf(ds(num_errors)); n_pprintf(" error(s) in the transfer..."); printf("Detected %0d error(s) in the transfer...\n", num_errors); #endif } @@ -159,19 +140,12 @@ void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, u 
uint32_t src_std_3; uint32_t reps_3; -#ifdef IRQ_EN - irq_en(1< 10 - // h_pprintf("dst_addr: 0x"); n_pprintf(hs(dst_addr)); - // h_pprintf("src_addr: 0x"); n_pprintf(hs(src_addr)); - // h_pprintf("len: "); n_pprintf(ds(len)); printf("dst_addr: 0x%0x\n", dst_addr); printf("src_addr: 0x%0x\n", src_addr); printf("len: %0d\n", len); @@ -182,9 +156,6 @@ void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, u src_std_2 = 0; reps_2 = 1; #if VERBOSE > 100 - // h_pprintf("dst_std_2: 0x"); n_pprintf(hs(dst_std_2)); - // h_pprintf("src_std_2: 0x"); n_pprintf(hs(src_std_2)); - // h_pprintf("reps_2: 0x"); n_pprintf(hs(reps_2)); printf("dst_std_2: 0x%0x\n", dst_std_2); printf("src_std_2: 0x%0x\n", src_std_2); printf("reps_2: 0x%0x\n", reps_2); @@ -195,9 +166,6 @@ void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, u src_std_3 = 0; reps_3 = 1; #if VERBOSE > 100 - // h_pprintf("dst_std_3: 0x"); n_pprintf(hs(dst_std_3)); - // h_pprintf("src_std_3: 0x"); n_pprintf(hs(src_std_3)); - // h_pprintf("reps_3: 0x"); n_pprintf(hs(reps_3)); printf("dst_std_3: 0x%0x\n", dst_std_3); printf("src_std_3: 0x%0x\n", src_std_3); printf("reps_3: 0x%0x\n", reps_3); @@ -206,17 +174,15 @@ void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, u idma_start_out(); -#ifdef IRQ_EN - asm volatile("wfi" ::: "memory"); - // h_pprintf("Detected IRQ...\n"); - printf("Detected IRQ...\n"); -#else - wait_print(WAIT_CYCLES); -#endif + // Clear Event Unit and ensure O2A mask is enabled + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_O2A_DONE_MASK); + + // Use direction-specific wait for L1->L2 (O2A) + eu_wait_events_polling(EU_IDMA_O2A_DONE_MASK, 10000000); #if VERBOSE > 100 for (int i = 0; i < x_dim*y_dim; i++){ - // h_pprintf("DST[0x"); pprintf(hs(dst_addr + 2*i)); pprintf("]: 0x"); n_pprintf(hs(mmio16(dst_addr + 2*i))); printf("DST[0x%0x]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i)); } #endif @@ -227,66 +193,51 @@ void 
idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, u for (int i = 0; i < x_dim*y_dim; i++) { if (mmio16(dst_addr + 2*i) != mmio16(src_addr + 2*i)) { num_errors++; - // h_pprintf("DST[0x"); pprintf(hs(dst_addr + 2*i)); pprintf("]: 0x"); pprintf(hs(mmio16(dst_addr + 2*i))); - // pprintf(" != SRC[0x"); pprintf(hs(src_addr + 2*i)); pprintf("]: 0x"); n_pprintf(hs(mmio16(src_addr + 2*i))); printf("DST[0x%0x]: 0x%0x != SRC[%0d]: 0x%0x\n", dst_addr + 2*i, mmio16(dst_addr + 2*i), i, mmio16(src_addr + 2*i)); } } - // h_pprintf("Detected "); pprintf(ds(num_errors)); n_pprintf(" error(s) in the transfer..."); printf("Detected %0d error(s) in the transfer...\n", num_errors); #endif } int main(void) { + // Initialize event unit + eu_init(); + // X - // h_pprintf("Initializing X through iDMA...\n"); printf("Initializing X through iDMA...\n"); idma_mv_in(M_SIZE, N_SIZE, x_inp, (X_BASE + get_hartid()*L1_TILE_OFFSET)); // W - // h_pprintf("Initializing W through iDMA...\n"); printf("Initializing W through iDMA...\n"); idma_mv_in(N_SIZE, K_SIZE, w_inp, (W_BASE + get_hartid()*L1_TILE_OFFSET)); // Y - // h_pprintf("Initializing Y through iDMA...\n"); printf("Initializing Y through iDMA...\n"); idma_mv_in(M_SIZE, K_SIZE, y_inp, (Y_BASE + get_hartid()*L1_TILE_OFFSET)); #if VERBOSE > 10 - // h_pprintf("K_SIZE: 0x"); n_pprintf(hs(K_SIZE)); - // h_pprintf("M_SIZE: 0x"); n_pprintf(hs(M_SIZE)); - // h_pprintf("N_SIZE: 0x"); n_pprintf(hs(N_SIZE)); printf("K_SIZE: 0x%0x\n", K_SIZE); printf("M_SIZE: 0x%0x\n", M_SIZE); printf("N_SIZE: 0x%0x\n", N_SIZE); #endif -#ifdef IRQ_EN - irq_en(1< expected[get_hartid()]) ? 
(computed[get_hartid()] - expected[get_hartid()]) : (expected[get_hartid()] - computed[get_hartid()]); if(diff[get_hartid()] > DIFF_TH){ num_errors[get_hartid()]++; - // h_pprintf("**ERROR**: V[0x"); pprintf(hs(V_BASE + get_hartid()*MHARTID_OFFSET + 2*i)); pprintf("](=0x"); pprintf(hs(computed[get_hartid()])); - // pprintf(") != Z["); pprintf(ds(i)); pprintf("](=0x"); pprintf(hs(expected[get_hartid()])); n_pprintf(")"); printf("**ERROR**: V[0x%0x](=0x%0x) != Z[%0d](=0x%0x)\n", V_BASE + get_hartid()*MHARTID_OFFSET + 2*i, computed[get_hartid()], i, expected[get_hartid()]); } } - // h_pprintf("Finished test with "); pprintf(ds(num_errors[get_hartid()])); n_pprintf(" error(s)"); printf("Finished test with %0d error(s)\n", num_errors[get_hartid()]); uint32_t exit_code[NUM_HARTS]; diff --git a/sw/tests/redmule_test.c b/sw/tests/xif_tests/redmule_test.c similarity index 91% rename from sw/tests/redmule_test.c rename to sw/tests/xif_tests/redmule_test.c index ecaef3f..4b4ca57 100644 --- a/sw/tests/redmule_test.c +++ b/sw/tests/xif_tests/redmule_test.c @@ -21,6 +21,7 @@ #include "magia_tile_utils.h" #include "redmule_isa_utils.h" +#include "event_unit_utils.h" #include "x_input.h" #include "w_input.h" @@ -38,13 +39,14 @@ #define VERBOSE (0) -#define IRQ_EN - #define WAIT_CYCLES (10) #define DIFF_TH (0x0011) int main(void) { + // Initialize Event Unit for RedMulE + eu_redmule_init(); + // X for (int i = 0; i < M_SIZE*N_SIZE; i++) mmio16(X_BASE + 2*i) = x_inp[i]; @@ -83,24 +85,14 @@ int main(void) { printf("N_SIZE: %4x\n", N_SIZE); #endif - redmule_mcnfig(K_SIZE, M_SIZE, N_SIZE); + // Wait for end of computation + printf("Testing matrix multiplication with RedMulE...\n"); + redmule_mcnfig(K_SIZE, M_SIZE, N_SIZE); redmule_marith(Y_BASE, W_BASE, X_BASE); -#ifdef IRQ_EN - // Enable IRQs - uint32_t index = (1< 100 for (int i = 0; i < x_dim*y_dim; i++) @@ -141,10 +134,6 @@ void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, u uint32_t src_std_3; uint32_t 
reps_3; -#ifdef IRQ_EN - irq_en(1< 100 for (int i = 0; i < x_dim*y_dim; i++) @@ -205,6 +192,9 @@ void idma_mv_out(unsigned int x_dim, unsigned int y_dim, uint32_t src_address, u } int main(void) { + // Initialize event unit + eu_init(); + // X printf("Initializing X through iDMA...\n"); idma_mv_in(M_SIZE, N_SIZE, x_inp, X_BASE); @@ -232,23 +222,15 @@ int main(void) { printf("N_SIZE: %4x\n", N_SIZE); #endif - redmule_mcnfig(K_SIZE, M_SIZE, N_SIZE); - - redmule_marith(Y_BASE, W_BASE, X_BASE); + printf("Testing matrix multiplication with RedMulE...\n"); -#ifdef IRQ_EN - irq_en(1< + * + * MAGIA Event Unit Utilities + * Two modes: POLLING (non-blocking) and WFE (blocking with p.elw sleep) + */ + +#ifndef EVENT_UNIT_UTILS_H +#define EVENT_UNIT_UTILS_H + +#include +#include "magia_tile_utils.h" + +//============================================================================= +// REGISTER DEFINITIONS AND CONSTANTS +//============================================================================= + +#define EU_BASE EVENT_UNIT_BASE + +// Control and status registers +#define EU_CORE_MASK (EU_BASE + 0x00) +#define EU_CORE_MASK_AND (EU_BASE + 0x04) +#define EU_CORE_MASK_OR (EU_BASE + 0x08) +#define EU_CORE_IRQ_MASK (EU_BASE + 0x0C) +#define EU_CORE_IRQ_MASK_AND (EU_BASE + 0x10) +#define EU_CORE_IRQ_MASK_OR (EU_BASE + 0x14) +#define EU_CORE_STATUS (EU_BASE + 0x18) +#define EU_CORE_BUFFER (EU_BASE + 0x1C) +#define EU_CORE_BUFFER_MASKED (EU_BASE + 0x20) +#define EU_CORE_BUFFER_IRQ_MASKED (EU_BASE + 0x24) +#define EU_CORE_BUFFER_CLEAR (EU_BASE + 0x28) + +// Wait registers (blocking with p.elw) +#define EU_CORE_EVENT_WAIT (EU_BASE + 0x38) +#define EU_CORE_EVENT_WAIT_CLEAR (EU_BASE + 0x3C) + +// Hardware mutex registers (0x04 * mutex_id offset) +#define EU_CORE_HW_MUTEX (EU_BASE + 0x0C0) // R/W: HW mutex management + +// Hardware barrier registers (0x20 * barr_id offset) +#define HW_BARR_TRIGGER_MASK (EU_BASE + 0x400) // R/W: Barrier trigger mask +#define HW_BARR_STATUS (EU_BASE + 
0x404) // R: Barrier status +#define HW_BARR_TARGET_MASK (EU_BASE + 0x40C) // R/W: Barrier target mask +#define HW_BARR_TRIGGER (EU_BASE + 0x410) // W: Manual barrier trigger +#define HW_BARR_TRIGGER_SELF (EU_BASE + 0x414) // R: Automatic trigger +#define HW_BARR_TRIGGER_WAIT (EU_BASE + 0x418) // R: Trigger + sleep +#define HW_BARR_TRIGGER_WAIT_CLEAR (EU_BASE + 0x41C) // R: Trigger + sleep + clear + +// Software event trigger registers (0x04 * sw_event_id offset) +#define EU_CORE_TRIGG_SW_EVENT (EU_BASE + 0x600) // W: Generate SW event +#define EU_CORE_TRIGG_SW_EVENT_WAIT (EU_BASE + 0x640) // R: Generate event + sleep +#define EU_CORE_TRIGG_SW_EVENT_WAIT_CLEAR (EU_BASE + 0x680) // R: Generate event + sleep + clear + +// SoC event FIFO register +#define EU_CORE_CURRENT_EVENT (EU_BASE + 0x700) // R: SoC event FIFO + +// Event bit mapping +#define EU_DMA_EVT_0_BIT 2 +#define EU_DMA_EVT_1_BIT 3 +#define EU_TIMER_EVT_0_BIT 4 +#define EU_TIMER_EVT_1_BIT 5 + +#define EU_REDMULE_UNUSED_BIT 8 +#define EU_REDMULE_BUSY_BIT 9 +#define EU_REDMULE_DONE_BIT 10 +#define EU_REDMULE_EVT1_BIT 11 + +// RedMulE event masks +#define EU_REDMULE_DONE_MASK (1 << EU_REDMULE_DONE_BIT) +#define EU_REDMULE_BUSY_MASK (1 << EU_REDMULE_BUSY_BIT) +#define EU_REDMULE_ALL_MASK 0x0F00 + +// iDMA events (DMA events [3:2] + extended [31:26]) +#define EU_IDMA_A2O_DONE_BIT 2 +#define EU_IDMA_O2A_DONE_BIT 3 +#define EU_IDMA_A2O_DONE_MASK (1 << EU_IDMA_A2O_DONE_BIT) +#define EU_IDMA_O2A_DONE_MASK (1 << EU_IDMA_O2A_DONE_BIT) +#define EU_IDMA_ALL_DONE_MASK (EU_IDMA_A2O_DONE_MASK | EU_IDMA_O2A_DONE_MASK) +#define EU_IDMA_A2O_ERROR_BIT 26 +#define EU_IDMA_O2A_ERROR_BIT 27 +#define EU_IDMA_A2O_START_BIT 28 +#define EU_IDMA_O2A_START_BIT 29 +#define EU_IDMA_A2O_BUSY_BIT 30 +#define EU_IDMA_O2A_BUSY_BIT 31 +#define EU_IDMA_A2O_ERROR_MASK (1 << EU_IDMA_A2O_ERROR_BIT) +#define EU_IDMA_O2A_ERROR_MASK (1 << EU_IDMA_O2A_ERROR_BIT) +#define EU_IDMA_A2O_START_MASK (1 << EU_IDMA_A2O_START_BIT) +#define 
EU_IDMA_O2A_START_MASK (1 << EU_IDMA_O2A_START_BIT)
+// The BUSY masks use an unsigned literal: EU_IDMA_O2A_BUSY_BIT is 31, and
+// shifting a signed 1 into the sign bit (1 << 31) is undefined behaviour in C.
+#define EU_IDMA_A2O_BUSY_MASK (1u << EU_IDMA_A2O_BUSY_BIT)
+#define EU_IDMA_O2A_BUSY_MASK (1u << EU_IDMA_O2A_BUSY_BIT)
+
+// FSync events (cluster events [25:24])
+#define EU_FSYNC_DONE_BIT 24
+#define EU_FSYNC_ERROR_BIT 25
+#define EU_FSYNC_DONE_MASK (1 << EU_FSYNC_DONE_BIT)
+#define EU_FSYNC_ERROR_MASK (1 << EU_FSYNC_ERROR_BIT)
+#define EU_FSYNC_ALL_MASK (EU_FSYNC_DONE_MASK | EU_FSYNC_ERROR_MASK)
+
+// Wait modes
+typedef enum {
+    EU_WAIT_MODE_POLLING = 0,   // spin on the event buffer (core stays awake)
+    EU_WAIT_MODE_WFE = 1        // sleep on a p.elw read of a wait register
+} eu_wait_mode_t;
+
+//=============================================================================
+// LOW-LEVEL HAL (PULP-compatible evt_read32)
+//=============================================================================
+
+// evt_read32: blocking read with p.elw instruction
+//   base, offset: summed to form the address that is loaded
+//   returns:      the word produced by the p.elw load
+static inline unsigned int evt_read32(unsigned int base, unsigned int offset) {
+    unsigned int value;
+    unsigned int addr = base + offset;
+    // Direct p.elw inline assembly for PULP cores (RI5CY, CV32E40P)
+    // The "memory" clobber keeps the compiler from moving memory accesses
+    // across the wait.
+    __asm__ __volatile__ (
+        "p.elw %0, 0(%1)"
+        : "=r" (value)
+        : "r" (addr)
+        : "memory"
+    );
+    return value;
+}
+
+//=============================================================================
+// BASIC CONTROL FUNCTIONS
+//=============================================================================
+
+// Reset the event unit view of this core: write all-ones to the buffer-clear
+// register, then zero both the event mask and the IRQ mask.
+static inline void eu_init(void) {
+    mmio32(EU_CORE_BUFFER_CLEAR) = 0xFFFFFFFF;
+    mmio32(EU_CORE_MASK) = 0x00000000;
+    mmio32(EU_CORE_IRQ_MASK) = 0x00000000;
+}
+
+// Write event_mask to the MASK_OR register.
+// NOTE(review): presumably the hardware ORs these bits into the event mask - confirm.
+static inline void eu_enable_events(uint32_t event_mask) {
+    mmio32(EU_CORE_MASK_OR) = event_mask;
+}
+
+// Write event_mask to the MASK_AND register.
+// NOTE(review): presumably the hardware clears the bits that are set in
+// event_mask (PULP event-unit convention) - confirm against the EU spec.
+static inline void eu_disable_events(uint32_t event_mask) {
+    mmio32(EU_CORE_MASK_AND) = event_mask;
+}
+
+// Write irq_mask to the IRQ_MASK_OR register (IRQ counterpart of eu_enable_events).
+static inline void eu_enable_irq(uint32_t irq_mask) {
+    mmio32(EU_CORE_IRQ_MASK_OR) = irq_mask;
+}
+
+// Write irq_mask to the IRQ_MASK_AND register (IRQ counterpart of eu_disable_events).
+static inline void eu_disable_irq(uint32_t irq_mask) {
+    mmio32(EU_CORE_IRQ_MASK_AND) = irq_mask;
+}
+
+// Write event_mask to the buffer-clear register.
+static inline void eu_clear_events(uint32_t event_mask) {
+
mmio32(EU_CORE_BUFFER_CLEAR) = event_mask; +} + +//============================================================================= +// STATUS READ FUNCTIONS (non-blocking) +//============================================================================= + +static inline uint32_t eu_get_events(void) { + return mmio32(EU_CORE_BUFFER); +} + +static inline uint32_t eu_get_events_masked(void) { + return mmio32(EU_CORE_BUFFER_MASKED); +} + +static inline uint32_t eu_check_events(uint32_t event_mask) { + return mmio32(EU_CORE_BUFFER_MASKED) & event_mask; +} + +//============================================================================= +// WAIT FUNCTIONS (polling and blocking) +//============================================================================= + +// POLLING mode: non-blocking busy-wait +static inline uint32_t eu_wait_events_polling(uint32_t event_mask, uint32_t timeout_cycles) { + uint32_t cycles = 0; + uint32_t detected_events; + do { + detected_events = eu_check_events(event_mask); + if (detected_events){ + eu_clear_events(detected_events); + return detected_events; + } + wait_nop(10); + cycles += 10; + } while (timeout_cycles == 0 || cycles < timeout_cycles); + return 0; +} + +// WFE mode: blocking sleep with p.elw +static inline uint32_t eu_wait_events_wfe(uint32_t event_mask) { + eu_enable_events(event_mask); + return evt_read32(EU_BASE, EU_CORE_EVENT_WAIT_CLEAR - EU_BASE); +} + +// Generic wait with mode selection +static inline uint32_t eu_wait_events(uint32_t event_mask, eu_wait_mode_t mode, uint32_t timeout_cycles) { + if (mode == EU_WAIT_MODE_WFE) + return eu_wait_events_wfe(event_mask); + else + return eu_wait_events_polling(event_mask, timeout_cycles); +} + +// PULP HAL compatible functions +static inline unsigned int eu_evt_wait(void) { + return evt_read32(EU_BASE, EU_CORE_EVENT_WAIT - EU_BASE); +} + +static inline unsigned int eu_evt_waitAndClr(void) { + return evt_read32(EU_BASE, EU_CORE_EVENT_WAIT_CLEAR - EU_BASE); +} + +static inline unsigned 
int eu_evt_maskWaitAndClr(unsigned int evtMask) { + eu_enable_events(evtMask); + unsigned int result = eu_evt_waitAndClr(); + eu_disable_events(evtMask); + return result; +} + +//============================================================================= +// REDMULE FUNCTIONS +//============================================================================= + +static inline void eu_redmule_init(void) { + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_REDMULE_DONE_MASK); +} + +static inline uint32_t eu_redmule_wait_completion(eu_wait_mode_t mode) { + return eu_wait_events(EU_REDMULE_DONE_MASK, mode, 1000000); +} + +static inline uint32_t eu_redmule_is_busy(void) { + return eu_check_events(EU_REDMULE_BUSY_MASK); +} + +static inline uint32_t eu_redmule_is_done(void) { + return eu_check_events(EU_REDMULE_DONE_MASK); +} + +//============================================================================= +// IDMA FUNCTIONS +//============================================================================= + +static inline void eu_idma_init(void) { + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_IDMA_ALL_DONE_MASK); +} + +static inline uint32_t eu_idma_wait_completion(eu_wait_mode_t mode) { + return eu_wait_events(EU_IDMA_ALL_DONE_MASK, mode, 1000000); +} + +static inline uint32_t eu_idma_wait_direction_completion(uint32_t direction, eu_wait_mode_t mode) { + uint32_t wait_mask = direction ? 
EU_IDMA_O2A_DONE_MASK : EU_IDMA_A2O_DONE_MASK; + return eu_wait_events(wait_mask, mode, 1000000); +} + +static inline uint32_t eu_idma_wait_a2o_completion(eu_wait_mode_t mode) { + return eu_wait_events(EU_IDMA_A2O_DONE_MASK, mode, 1000000); +} + +static inline uint32_t eu_idma_wait_o2a_completion(eu_wait_mode_t mode) { + return eu_wait_events(EU_IDMA_O2A_DONE_MASK, mode, 1000000); +} + +static inline uint32_t eu_idma_is_done(void) { + return eu_check_events(EU_IDMA_ALL_DONE_MASK); +} + +static inline uint32_t eu_idma_a2o_is_done(void) { + return eu_check_events(EU_IDMA_A2O_DONE_MASK); +} + +static inline uint32_t eu_idma_o2a_is_done(void) { + return eu_check_events(EU_IDMA_O2A_DONE_MASK); +} + +static inline uint32_t eu_idma_is_busy(void) { + uint32_t events = eu_get_events(); + return events & (EU_IDMA_A2O_BUSY_MASK | EU_IDMA_O2A_BUSY_MASK); +} + +static inline uint32_t eu_idma_has_error(void) { + uint32_t events = eu_get_events(); + return events & (EU_IDMA_A2O_ERROR_MASK | EU_IDMA_O2A_ERROR_MASK); +} + +//============================================================================= +// FSYNC FUNCTIONS +//============================================================================= + +static inline void eu_fsync_init(void) { + eu_clear_events(0xFFFFFFFF); + eu_enable_events(EU_FSYNC_ALL_MASK); +} + +static inline uint32_t eu_fsync_wait_completion(eu_wait_mode_t mode) { + return eu_wait_events(EU_FSYNC_DONE_MASK, mode, 1000000); +} + +static inline uint32_t eu_fsync_is_done(void) { + return eu_check_events(EU_FSYNC_DONE_MASK); +} + +static inline uint32_t eu_fsync_has_error(void) { + return eu_check_events(EU_FSYNC_ERROR_MASK); +} + +//============================================================================= +// MULTI-ACCELERATOR FUNCTIONS +//============================================================================= + +static inline void eu_multi_init(uint32_t redmule_en, uint32_t idma_a2o_en, + uint32_t idma_o2a_en, uint32_t fsync_en) { + 
eu_clear_events(0xFFFFFFFF);
+    uint32_t event_mask = 0;
+
+    if (redmule_en) {
+        event_mask |= EU_REDMULE_ALL_MASK;
+    }
+    if (idma_a2o_en) {
+        event_mask |= EU_IDMA_A2O_DONE_MASK;
+    }
+    if (idma_o2a_en) {
+        event_mask |= EU_IDMA_O2A_DONE_MASK;
+    }
+    if (fsync_en) {
+        event_mask |= EU_FSYNC_ALL_MASK;
+    }
+
+    if (event_mask) eu_enable_events(event_mask);
+}
+
+// Wait until every selected DONE event has been observed.
+//   wait_*: non-zero selects the corresponding DONE event
+//   mode:   EU_WAIT_MODE_WFE sleeps on EU_CORE_EVENT_WAIT_CLEAR until all
+//           selected events were seen (no timeout); any other value polls
+//           with a budget of 1000000 counted cycles
+//   returns the OR of all events observed; in polling mode this can lack
+//   some selected bits if the budget ran out, so compare it with the
+//   expected mask.
+static inline uint32_t eu_multi_wait_all(uint32_t wait_redmule, uint32_t wait_idma_a2o,
+                                         uint32_t wait_idma_o2a, uint32_t wait_fsync,
+                                         eu_wait_mode_t mode) {
+    uint32_t required_mask = 0;
+    if (wait_redmule) required_mask |= EU_REDMULE_DONE_MASK;
+    if (wait_idma_a2o) required_mask |= EU_IDMA_A2O_DONE_MASK;
+    if (wait_idma_o2a) required_mask |= EU_IDMA_O2A_DONE_MASK;
+    if (wait_fsync) required_mask |= EU_FSYNC_DONE_MASK;
+
+    // The masked buffer / wait registers only report enabled events.
+    eu_enable_events(required_mask);
+
+    if (mode == EU_WAIT_MODE_WFE) {
+        uint32_t accumulated = 0;
+        while ((accumulated & required_mask) != required_mask) {
+            uint32_t new_events = evt_read32(EU_BASE, EU_CORE_EVENT_WAIT_CLEAR - EU_BASE);
+            accumulated |= new_events;
+        }
+        return accumulated;
+    } else {
+        uint32_t timeout = 1000000;
+        uint32_t cycles = 0;
+        uint32_t accumulated = 0;
+        while (cycles < timeout && (accumulated & required_mask) != required_mask) {
+            uint32_t detected = eu_check_events(required_mask);
+            if (detected) {
+                // Consume what was seen, as eu_wait_events_polling and the
+                // WFE path do; otherwise the latched bits would satisfy the
+                // next wait immediately.
+                eu_clear_events(detected);
+                accumulated |= detected;
+            }
+            wait_nop(10);
+            cycles += 10;
+        }
+        return accumulated;
+    }
+}
+
+#endif /* EVENT_UNIT_UTILS_H */
diff --git a/sw/utils/fsync_mm_api.h b/sw/utils/fsync_mm_api.h
new file mode 100644
index 0000000..93f82b6
--- /dev/null
+++ b/sw/utils/fsync_mm_api.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Authors: Luca Balboni
+ * Based on fsync_api.h by Victor Isachi
+ *
+ * MAGIA FractalSync Memory-Mapped Synchronization Patterns API
+ */
+
+#ifndef FSYNC_MM_API_H
+#define FSYNC_MM_API_H
+
+  #include "fsync_mm_utils.h"
+  #include "magia_tile_utils.h"
+  #include "magia_utils.h"
+
+  #define _FS_MM_GLOBAL_AGGR (0xFFFFFFFF >> (1+__builtin_clz(NUM_HARTS)))
+  #define _FS_MM_GLOBAL_ID (-1)
+  #define _FS_MM_HNBR_AGGR (0x1)
+  #define _FS_MM_HNBR_ID (0)
+  #define _FS_MM_VNBR_AGGR (0x1)
+  #define _FS_MM_VNBR_ID (1)
+  #define _FS_MM_HRING_AGGR (0x1)
+  #define _FS_MM_HRING_ID (2)
+  #define _FS_MM_VRING_AGGR (0x1)
+  #define _FS_MM_VRING_ID (3)
+  #define _FS_MM_RC_LVL (0x1 << (29-__builtin_clz(NUM_HARTS)))
+  #define _FS_MM_RC_AGGR (0x155 >> (__builtin_clz(NUM_HARTS)-21))
+
+  // All helpers in this header are static inline so that it can be included
+  // from several translation units without multiple-definition link errors.
+
+  // Lookup table indicating the id of row synchronization:
+  // rows in the lower half map to 2*y, rows in the upper half map to
+  // 2*(y - MESH_Y_TILES/2), i.e. always an even id.
+  static inline uint32_t row_id_lookup_mm(volatile uint32_t hartid_y){
+    if (hartid_y < MESH_Y_TILES/2) return 2*hartid_y;
+    else return 2*(hartid_y-MESH_Y_TILES/2);
+  }
+
+  // Lookup table indicating the id of column synchronization:
+  // same folding as the row lookup, but returning odd ids (2*k + 1).
+  static inline uint32_t col_id_lookup_mm(volatile uint32_t hartid_x){
+    if (hartid_x < MESH_X_TILES/2) return 2*hartid_x+1;
+    else return 2*(hartid_x-MESH_X_TILES/2)+1;
+  }
+
+  // Synchronize using the horizontal-neighbor id/aggregate pair.
+  static inline void fsync_mm_hnbr(){
+    fsync_mm(_FS_MM_HNBR_ID, _FS_MM_HNBR_AGGR);
+  }
+
+  // Synchronize using the vertical-neighbor id/aggregate pair.
+  static inline void fsync_mm_vnbr(){
+    fsync_mm(_FS_MM_VNBR_ID, _FS_MM_VNBR_AGGR);
+  }
+
+  // Horizontal ring synchronization: tiles in the first or last column
+  // synchronize on their row id with the _FS_MM_RC_LVL aggregate; all other
+  // tiles use the fixed HRING id/aggregate.
+  static inline void fsync_mm_hring(){
+    uint32_t hartid = get_hartid();
+    uint32_t hartid_x = GET_X_ID(hartid);
+    uint32_t hartid_y = GET_Y_ID(hartid);
+    if ((hartid_x == 0) || (hartid_x == MESH_X_TILES-1)){
+      uint32_t id = row_id_lookup_mm(hartid_y);
+      fsync_mm(id, _FS_MM_RC_LVL);
+    } else {
+      fsync_mm(_FS_MM_HRING_ID, _FS_MM_HRING_AGGR);
+    }
+  }
+
+  // Vertical ring synchronization: tiles in the first or last row
+  // synchronize on their column id with the _FS_MM_RC_LVL aggregate; all
+  // other tiles use the fixed VRING id/aggregate.
+  static inline void fsync_mm_vring(){
+    uint32_t hartid = get_hartid();
+    uint32_t hartid_x = GET_X_ID(hartid);
+    uint32_t hartid_y = GET_Y_ID(hartid);
+    if ((hartid_y == 0) || (hartid_y == MESH_Y_TILES-1)){
+      uint32_t id = col_id_lookup_mm(hartid_x);
+      fsync_mm(id, _FS_MM_RC_LVL);
+    } else {
+      fsync_mm(_FS_MM_VRING_ID, _FS_MM_VRING_AGGR);
+    }
+  }
+
+  // Synchronize on the id of the caller's row with the _FS_MM_RC_AGGR aggregate.
+  static inline void fsync_mm_rows(){
+    uint32_t hartid = get_hartid();
+    uint32_t hartid_y = GET_Y_ID(hartid);
+    uint32_t id = row_id_lookup_mm(hartid_y);
+    fsync_mm(id, _FS_MM_RC_AGGR);
+  }
+
+  // Synchronize on the id of the caller's column with the _FS_MM_RC_AGGR aggregate.
+  static inline void fsync_mm_cols(){
+    uint32_t hartid = get_hartid();
+    uint32_t hartid_x = GET_X_ID(hartid);
+    uint32_t id = col_id_lookup_mm(hartid_x);
+    fsync_mm(id, _FS_MM_RC_AGGR);
+  }
+
+  // Synchronize using the global id/aggregate pair.
+  static inline void fsync_mm_global(){
+    fsync_mm(_FS_MM_GLOBAL_ID, _FS_MM_GLOBAL_AGGR);
+  }
+
+
+
+#endif /*FSYNC_MM_API_H*/
\ No newline at end of file
diff --git a/sw/utils/fsync_mm_utils.h b/sw/utils/fsync_mm_utils.h
new file mode 100644
index 0000000..8398f98
--- /dev/null
+++ b/sw/utils/fsync_mm_utils.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2023-2024 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on fsync_isa_utils.h by Victor Isachi + * + * MAGIA FractalSync Memory-Mapped Utils + * WARNING: Make sure to undefine EVENT_UNIT in this file if POLLING in registers mm is desired, otherwise polling mode will not work correctly + */ + +#ifndef FSYNC_MM_UTILS_H +#define FSYNC_MM_UTILS_H +#define EVENT_UNIT + +#include "magia_tile_utils.h" + +/* Memory-mapped FractalSync register offsets */ +#define FSYNC_MM_AGGR_REG_OFFSET (0x00) +#define FSYNC_MM_ID_REG_OFFSET (0x04) +#define FSYNC_MM_CONTROL_REG_OFFSET (0x08) +#define FSYNC_MM_STATUS_REG_OFFSET (0x0C) + +/* Status register bits */ +#define FSYNC_MM_STATUS_BUSY_MASK (1 << 2) + +/* Memory-mapped sync function */ +static inline void fsync_mm(volatile uint32_t id, volatile uint32_t aggregate){ + volatile char *fsync_base = (volatile char *)(FSYNC_BASE); + + *(volatile uint32_t *)(fsync_base + FSYNC_MM_AGGR_REG_OFFSET) = aggregate; + *(volatile uint32_t *)(fsync_base + FSYNC_MM_ID_REG_OFFSET) = id; + *(volatile uint32_t *)(fsync_base + FSYNC_MM_CONTROL_REG_OFFSET) = 1; + +#ifndef EVENT_UNIT + // Polling mode - wait for completion + volatile uint32_t status; + do { + status = *(volatile uint32_t *)(fsync_base + FSYNC_MM_STATUS_REG_OFFSET); + if (status & FSYNC_MM_STATUS_BUSY_MASK) { + printf("FSYNC_MM still busy...\n"); + // Still busy, optionally add a small delay here + } + } while (status & FSYNC_MM_STATUS_BUSY_MASK); +#endif + // In non-stalling mode, the function returns immediately + // and the caller should do wfi to wait for interrupt +} + +#endif /*FSYNC_MM_UTILS_H*/ \ No newline at end of file diff --git a/sw/utils/idma_mm_utils.h b/sw/utils/idma_mm_utils.h new file mode 100644 index 0000000..558a024 --- /dev/null +++ b/sw/utils/idma_mm_utils.h @@ -0,0 +1,355 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * Authors: Luca Balboni + * Based on idma_utils.h by Victor Isachi + * + * MAGIA iDMA Memory-Mapped I/O Utils + */ + +#ifndef IDMA_MM_UTILS_H +#define IDMA_MM_UTILS_H + +#include +#include "magia_tile_utils.h" + +//============================================================================= +// Register Definitions and Constants +//============================================================================= + +// iDMA Memory-Mapped Register Base Addresses +#define IDMA_MM_DIRECTION_OFFSET (0x200) +#define IDMA_MM_BASE_AXI2OBI (IDMA_BASE) // direction=0, L2 to L1 +#define IDMA_MM_BASE_OBI2AXI (IDMA_BASE + IDMA_MM_DIRECTION_OFFSET) // direction=1, L1 to L2 + +#define IDMA_CONF_OFFSET (0x00) +#define IDMA_STATUS_OFFSET (0x04) +#define IDMA_NEXT_ID_OFFSET (0x44) +#define IDMA_DONE_ID_OFFSET (0x84) +#define IDMA_DST_ADDR_LOW_OFFSET (0xD0) +#define IDMA_SRC_ADDR_LOW_OFFSET (0xD8) +#define IDMA_LENGTH_LOW_OFFSET (0xE0) +#define IDMA_DST_STRIDE_2_LOW_OFFSET (0xE8) +#define IDMA_SRC_STRIDE_2_LOW_OFFSET (0xF0) +#define IDMA_REPS_2_LOW_OFFSET (0xF8) +#define IDMA_DST_STRIDE_3_LOW_OFFSET (0x100) +#define IDMA_SRC_STRIDE_3_LOW_OFFSET (0x108) +#define IDMA_REPS_3_LOW_OFFSET (0x110) + +// Register Addresses - now direction-aware +#define IDMA_CONF_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_CONF_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_CONF_OFFSET)) +#define IDMA_STATUS_ADDR(is_l1_to_l2, id) ((is_l1_to_l2) ? 
(IDMA_MM_BASE_OBI2AXI + IDMA_STATUS_OFFSET + ((id) * 4)) : (IDMA_MM_BASE_AXI2OBI + IDMA_STATUS_OFFSET + ((id) * 4))) +#define IDMA_NEXT_ID_ADDR(is_l1_to_l2, id) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_NEXT_ID_OFFSET + ((id) * 4)) : (IDMA_MM_BASE_AXI2OBI + IDMA_NEXT_ID_OFFSET + ((id) * 4))) +#define IDMA_DONE_ID_ADDR(is_l1_to_l2, id) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_DONE_ID_OFFSET + ((id) * 4)) : (IDMA_MM_BASE_AXI2OBI + IDMA_DONE_ID_OFFSET + ((id) * 4))) +#define IDMA_DST_ADDR_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_DST_ADDR_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_DST_ADDR_LOW_OFFSET)) +#define IDMA_SRC_ADDR_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_SRC_ADDR_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_SRC_ADDR_LOW_OFFSET)) +#define IDMA_LENGTH_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_LENGTH_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_LENGTH_LOW_OFFSET)) +#define IDMA_DST_STRIDE_2_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_DST_STRIDE_2_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_DST_STRIDE_2_LOW_OFFSET)) +#define IDMA_SRC_STRIDE_2_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_SRC_STRIDE_2_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_SRC_STRIDE_2_LOW_OFFSET)) +#define IDMA_REPS_2_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_REPS_2_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_REPS_2_LOW_OFFSET)) +#define IDMA_DST_STRIDE_3_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_DST_STRIDE_3_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_DST_STRIDE_3_LOW_OFFSET)) +#define IDMA_SRC_STRIDE_3_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? (IDMA_MM_BASE_OBI2AXI + IDMA_SRC_STRIDE_3_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_SRC_STRIDE_3_LOW_OFFSET)) +#define IDMA_REPS_3_LOW_ADDR(is_l1_to_l2) ((is_l1_to_l2) ? 
(IDMA_MM_BASE_OBI2AXI + IDMA_REPS_3_LOW_OFFSET) : (IDMA_MM_BASE_AXI2OBI + IDMA_REPS_3_LOW_OFFSET)) + +// Configuration Register Bit Fields +#define IDMA_CONF_DECOUPLE_AW_BIT (0) +#define IDMA_CONF_DECOUPLE_RW_BIT (1) +#define IDMA_CONF_SRC_REDUCE_LEN_BIT (2) +#define IDMA_CONF_DST_REDUCE_LEN_BIT (3) +#define IDMA_CONF_SRC_MAX_LLEN_MASK (0x70) // bits 6:4 +#define IDMA_CONF_SRC_MAX_LLEN_SHIFT (4) +#define IDMA_CONF_DST_MAX_LLEN_MASK (0x380) // bits 9:7 +#define IDMA_CONF_DST_MAX_LLEN_SHIFT (7) +#define IDMA_CONF_ENABLE_ND_MASK (0xC00) // bits 11:10 +#define IDMA_CONF_ENABLE_ND_SHIFT (10) + +// Status Register Bit Fields +#define IDMA_STATUS_BUSY_MASK (0x3FF) // bits 9:0 + +// Transfer Direction Constants +#define IDMA_DIR_L2_TO_L1 (0) // AXI2OBI direction +#define IDMA_DIR_L1_TO_L2 (1) // OBI2AXI direction + +// Direction aliases +#define IDMA_EXT2LOC 0 // L2 to L1 (AXI2OBI) +#define IDMA_LOC2EXT 1 // L1 to L2 (OBI2AXI) + +// Transfer dimensions +#define IDMA_1D 0 +#define IDMA_2D 1 +#define IDMA_3D 2 + +// Protocol definitions +typedef enum { + IDMA_PROT_AXI = 0, // AXI protocol: L2 memory + IDMA_PROT_OBI = 1 // OBI protocol: L1 memory +} idma_prot_t; + +typedef unsigned int dma_ext_t; + +// Configuration macros +#define IDMA_DEFAULT_CONFIG 0x0 + +//============================================================================= +// Low-Level Register Access Functions +//============================================================================= + +static inline void idma_mm_conf_dir(uint32_t is_l1_to_l2, uint32_t decouple_aw, uint32_t decouple_rw, + uint32_t src_reduce_len, uint32_t dst_reduce_len, + uint32_t src_max_llen, uint32_t dst_max_llen, + uint32_t enable_nd) { + uint32_t conf_val = 0; + + if (decouple_aw) conf_val |= (1 << IDMA_CONF_DECOUPLE_AW_BIT); + if (decouple_rw) conf_val |= (1 << IDMA_CONF_DECOUPLE_RW_BIT); + if (src_reduce_len) conf_val |= (1 << IDMA_CONF_SRC_REDUCE_LEN_BIT); + if (dst_reduce_len) conf_val |= (1 << IDMA_CONF_DST_REDUCE_LEN_BIT); + 
+ conf_val |= ((src_max_llen & 0x7) << IDMA_CONF_SRC_MAX_LLEN_SHIFT); + conf_val |= ((dst_max_llen & 0x7) << IDMA_CONF_DST_MAX_LLEN_SHIFT); + conf_val |= ((enable_nd & 0x3) << IDMA_CONF_ENABLE_ND_SHIFT); + + mmio32(IDMA_CONF_ADDR(is_l1_to_l2)) = conf_val; +} + +static inline void idma_mm_conf_default_dir(uint32_t is_l1_to_l2) { + idma_mm_conf_dir(is_l1_to_l2, 0, 0, 0, 0, 0, 0, 3); +} + +static inline uint32_t idma_mm_is_busy_dir(uint32_t is_l1_to_l2, uint32_t stream_id) { + if (stream_id >= 16) return 0; + uint32_t status = mmio32(IDMA_STATUS_ADDR(is_l1_to_l2, stream_id)); + return (status & IDMA_STATUS_BUSY_MASK) ? 1 : 0; +} + +static inline uint32_t idma_mm_start_transfer_dir(uint32_t is_l1_to_l2, uint32_t stream_id) { + if (stream_id >= 16) return 0; + uint32_t transfer_id = mmio32(IDMA_NEXT_ID_ADDR(is_l1_to_l2, stream_id)); + return transfer_id; +} + +static inline uint32_t idma_mm_get_done_id_dir(uint32_t is_l1_to_l2, uint32_t stream_id) { + if (stream_id >= 16) return 0; + return mmio32(IDMA_DONE_ID_ADDR(is_l1_to_l2, stream_id)); +} + +static inline void idma_mm_set_addr_len_dir(uint32_t is_l1_to_l2, uint32_t dst_addr, uint32_t src_addr, uint32_t length) { + mmio32(IDMA_DST_ADDR_LOW_ADDR(is_l1_to_l2)) = dst_addr; + mmio32(IDMA_SRC_ADDR_LOW_ADDR(is_l1_to_l2)) = src_addr; + mmio32(IDMA_LENGTH_LOW_ADDR(is_l1_to_l2)) = length; +} + +static inline void idma_mm_set_2d_params_dir(uint32_t is_l1_to_l2, uint32_t dst_stride_2, uint32_t src_stride_2, uint32_t reps_2) { + mmio32(IDMA_DST_STRIDE_2_LOW_ADDR(is_l1_to_l2)) = dst_stride_2; + mmio32(IDMA_SRC_STRIDE_2_LOW_ADDR(is_l1_to_l2)) = src_stride_2; + mmio32(IDMA_REPS_2_LOW_ADDR(is_l1_to_l2)) = reps_2; +} + +static inline void idma_mm_set_3d_params_dir(uint32_t is_l1_to_l2, uint32_t dst_stride_3, uint32_t src_stride_3, uint32_t reps_3) { + mmio32(IDMA_DST_STRIDE_3_LOW_ADDR(is_l1_to_l2)) = dst_stride_3; + mmio32(IDMA_SRC_STRIDE_3_LOW_ADDR(is_l1_to_l2)) = src_stride_3; + mmio32(IDMA_REPS_3_LOW_ADDR(is_l1_to_l2)) = reps_3; 
+} + +static inline uint32_t idma_mm_wait_for_completion(uint32_t direction, uint32_t transfer_id) { + if (transfer_id == 0) return 0; + + uint32_t is_l1_to_l2 = (direction == IDMA_DIR_L1_TO_L2) ? 1 : 0; + uint32_t stream_id = 0; + uint32_t timeout = 1000000; + + while (timeout-- > 0) { + uint32_t is_busy = idma_mm_is_busy_dir(is_l1_to_l2, stream_id); + + if (!is_busy) { + uint32_t done_id = idma_mm_get_done_id_dir(is_l1_to_l2, stream_id); + if (done_id == transfer_id) { + return 1; + } + } + + wait_nop(10); + } + + return 0; +} + +//============================================================================= +// High-Level DMA API - 1D Transfers +//============================================================================= + +// Forward declarations +static inline int idma_L1ToL2(unsigned int src, unsigned int dst, unsigned short size); +static inline int idma_L2ToL1(unsigned int src, unsigned int dst, unsigned short size); +static inline int idma_L1ToL1(unsigned int src, unsigned int dst, unsigned short size); +static inline int idma_L1ToL2_2d(unsigned int src, unsigned int dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps); +static inline int idma_L2ToL1_2d(unsigned int src, unsigned int dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps); +static inline int idma_L1ToL1_2d(unsigned int src, unsigned int dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps); + +static inline int dma_memcpy(dma_ext_t ext, unsigned int loc, unsigned short size, int ext2loc) { + if (ext2loc) + return idma_L2ToL1(ext, loc, size); + else + return idma_L1ToL2(loc, ext, size); +} + +static inline int dma_l1ToExt(dma_ext_t ext, unsigned int loc, unsigned short size) { + return idma_L1ToL2(loc, ext, size); +} + +static inline int dma_extToL1(unsigned int loc, dma_ext_t ext, unsigned short size) { + return idma_L2ToL1(ext, loc, size); +} + 
+// Generic 1D copy: dispatch on the (source, destination) protocol pair.
+//   src, dst:  source / destination addresses
+//   size:      transfer length; must fit in 16 bits because the direction
+//              helpers take an unsigned short
+//   returns the transfer id handed back by the selected helper, or 0 when
+//   the protocol pair is unsupported (e.g. AXI -> AXI) or size is too large.
+static inline int idma_memcpy(unsigned int src, unsigned int dst, unsigned int size,
+                              idma_prot_t src_prot, idma_prot_t dst_prot) {
+    // Refuse instead of silently truncating the length to 16 bits.
+    if (size > 0xFFFFu) return 0;
+
+    if (src_prot == IDMA_PROT_OBI && dst_prot == IDMA_PROT_AXI) {
+        return idma_L1ToL2(src, dst, size);
+    } else if (src_prot == IDMA_PROT_AXI && dst_prot == IDMA_PROT_OBI) {
+        return idma_L2ToL1(src, dst, size);
+    } else if (src_prot == IDMA_PROT_OBI && dst_prot == IDMA_PROT_OBI) {
+        return idma_L1ToL1(src, dst, size);
+    }
+    return 0;
+}
+
+// 1D transfer on the OBI2AXI register set (direction 1): default config,
+// dst/src/length, 2D and 3D stages set to stride 0 / 1 repetition, then the
+// NEXT_ID register of stream 0 is read and its value returned.
+// NOTE(review): presumably that read is what launches the transfer - confirm.
+static inline int idma_L1ToL2(unsigned int src, unsigned int dst, unsigned short size) {
+    idma_mm_conf_default_dir(1);
+    idma_mm_set_addr_len_dir(1, dst, src, size);
+    idma_mm_set_2d_params_dir(1, 0, 0, 1);
+    idma_mm_set_3d_params_dir(1, 0, 0, 1);
+    return idma_mm_start_transfer_dir(1, 0);
+}
+
+// 1D transfer on the AXI2OBI register set (direction 0); same sequence as
+// idma_L1ToL2.
+static inline int idma_L2ToL1(unsigned int src, unsigned int dst, unsigned short size) {
+    idma_mm_conf_default_dir(0);
+    idma_mm_set_addr_len_dir(0, dst, src, size);
+    idma_mm_set_2d_params_dir(0, 0, 0, 1);
+    idma_mm_set_3d_params_dir(0, 0, 0, 1);
+    return idma_mm_start_transfer_dir(0, 0);
+}
+
+// L1-to-L1 transfers: Remote to Local (PULL)
+static inline int idma_L1ToL1_pull(unsigned int remote_src, unsigned int local_dst, unsigned short size) {
+    idma_mm_conf_default_dir(0); // AXI2OBI: read from remote (AXI), write to local (OBI)
+    idma_mm_set_addr_len_dir(0, local_dst, remote_src, size);
+    idma_mm_set_2d_params_dir(0, 0, 0, 1);
+    idma_mm_set_3d_params_dir(0, 0, 0, 1);
+    return idma_mm_start_transfer_dir(0, 0);
+}
+
+// L1-to-L1 transfers: Local to Remote (PUSH)
+static inline int idma_L1ToL1_push(unsigned int local_src, unsigned int remote_dst, unsigned short size) {
+    idma_mm_conf_default_dir(1); // OBI2AXI: read from local (OBI), write to remote (AXI)
+    idma_mm_set_addr_len_dir(1, remote_dst, local_src, size);
+    idma_mm_set_2d_params_dir(1, 0, 0, 1);
+    idma_mm_set_3d_params_dir(1, 0, 0, 1);
+    return idma_mm_start_transfer_dir(1, 0);
+}
+
+// Legacy L1-to-L1 1D function (assumes remote->local, like idma_L1ToL1_2d).
+// It is forward-declared and used by idma_memcpy, so it needs a definition.
+static inline int idma_L1ToL1(unsigned int src, unsigned int dst, unsigned short size) {
+    return idma_L1ToL1_pull(src, dst, size);
+}
+
+
+//============================================================================= +// High-Level DMA API - 2D Transfers +//============================================================================= + +static inline int idma_memcpy_2d(unsigned int src, unsigned int dst, unsigned int size, + unsigned int src_stride, unsigned int dst_stride, + unsigned int num_reps, idma_prot_t src_prot, idma_prot_t dst_prot) { + if (src_prot == IDMA_PROT_OBI && dst_prot == IDMA_PROT_AXI) { + return idma_L1ToL2_2d(src, dst, size, src_stride, dst_stride, num_reps); + } else if (src_prot == IDMA_PROT_AXI && dst_prot == IDMA_PROT_OBI) { + return idma_L2ToL1_2d(src, dst, size, src_stride, dst_stride, num_reps); + } else if (src_prot == IDMA_PROT_OBI && dst_prot == IDMA_PROT_OBI) { + return idma_L1ToL1_2d(src, dst, size, src_stride, dst_stride, num_reps); + } + return 0; +} + +static inline int idma_L1ToL2_2d(unsigned int src, unsigned int dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps) { + idma_mm_conf_default_dir(1); + idma_mm_set_addr_len_dir(1, dst, src, size); + idma_mm_set_2d_params_dir(1, dst_stride, src_stride, num_reps); + idma_mm_set_3d_params_dir(1, 0, 0, 1); + return idma_mm_start_transfer_dir(1, 0); +} + +static inline int idma_L2ToL1_2d(unsigned int src, unsigned int dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps) { + idma_mm_conf_default_dir(0); + idma_mm_set_addr_len_dir(0, dst, src, size); + idma_mm_set_2d_params_dir(0, dst_stride, src_stride, num_reps); + idma_mm_set_3d_params_dir(0, 0, 0, 1); + return idma_mm_start_transfer_dir(0, 0); +} + +// L1-to-L1 2D transfers: Remote to Local (PULL) +static inline int idma_L1ToL1_pull_2d(unsigned int remote_src, unsigned int local_dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps) { + idma_mm_conf_default_dir(0); // AXI2OBI + idma_mm_set_addr_len_dir(0, local_dst, remote_src, 
size); + idma_mm_set_2d_params_dir(0, dst_stride, src_stride, num_reps); + idma_mm_set_3d_params_dir(0, 0, 0, 1); + return idma_mm_start_transfer_dir(0, 0); +} + +// L1-to-L1 2D transfers: Local to Remote (PUSH) +static inline int idma_L1ToL1_push_2d(unsigned int local_src, unsigned int remote_dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps) { + idma_mm_conf_default_dir(1); // OBI2AXI + idma_mm_set_addr_len_dir(1, remote_dst, local_src, size); + idma_mm_set_2d_params_dir(1, dst_stride, src_stride, num_reps); + idma_mm_set_3d_params_dir(1, 0, 0, 1); + return idma_mm_start_transfer_dir(1, 0); +} + +// Legacy L1-to-L1 2D function (assumes remote->local for backward compatibility) +static inline int idma_L1ToL1_2d(unsigned int src, unsigned int dst, unsigned short size, + unsigned int src_stride, unsigned int dst_stride, unsigned int num_reps) { + return idma_L1ToL1_pull_2d(src, dst, size, src_stride, dst_stride, num_reps); +} + +//============================================================================= +// Status and Wait Functions +//============================================================================= + +static inline unsigned int idma_tx_cplt(unsigned int dma_tx_id) { + uint32_t done_id_axi2obi = idma_mm_get_done_id_dir(0, 0); + uint32_t done_id_obi2axi = idma_mm_get_done_id_dir(1, 0); + + return (done_id_axi2obi == dma_tx_id) || (done_id_obi2axi == dma_tx_id); +} + +static inline unsigned int dma_status() { + return idma_mm_is_busy_dir(0, 0) || idma_mm_is_busy_dir(1, 0); +} + +static inline void dma_wait(unsigned int dma_tx_id) { + while(!idma_tx_cplt(dma_tx_id)) { + wait_nop(1); + } +} + +static inline void dma_barrier() { + while(dma_status()) { + wait_nop(1); + } +} + +#endif /*IDMA_MM_UTILS_H*/ \ No newline at end of file diff --git a/sw/utils/magia_tile_utils.h b/sw/utils/magia_tile_utils.h index f5c8006..dda3f7d 100644 --- a/sw/utils/magia_tile_utils.h +++ b/sw/utils/magia_tile_utils.h @@ -22,6 
+22,7 @@ #ifndef MAGIA_TILE_UTILS_H #define MAGIA_TILE_UTILS_H +#include #include "tinyprintf.h" #define NUM_L1_BANKS (32) @@ -29,20 +30,30 @@ #define BITS_WORD (32) #define BITS_BYTE (8) -#define RESERVED_START (0x00000000) -#define RESERVED_END (0x0000FFFF) -#define STACK_START (0x00010000) -#define STACK_END (0x0001FFFF) -#define L1_BASE (0x00020000) -#define L1_SIZE (0x000E0000) -#define L1_TILE_OFFSET (0x00100000) -#define L2_BASE (0xCC000000) -#define TEST_END_ADDR (0xCC030000) +#define REDMULE_BASE (0x00000100) +#define REDMULE_END (0x000001FF) +#define IDMA_BASE (0x00000200) +#define IDMA_END (0x000005FF) +#define FSYNC_BASE (0x00000600) +#define FSYNC_END (0x000006FF) +#define EVENT_UNIT_BASE (0x00000700) +#define EVENT_UNIT_END (0x000016FF) +#define RESERVED_START (0x00001700) +#define RESERVED_END (0x0000FFFF) +#define STACK_START (0x00010000) +#define STACK_END (0x0001FFFF) +#define L1_BASE (0x00020000) +#define L1_SIZE (0x000E0000) +#define L1_TILE_OFFSET (0x00100000) +#define L2_BASE (0xCC000000) +#define TEST_END_ADDR (0xCC030000) #define DEFAULT_EXIT_CODE (0xDEFC) #define PASS_EXIT_CODE (0xAAAA) #define FAIL_EXIT_CODE (0xFFFF) +// Individual IRQ indices unnecessary - Event Unit provides unified interrupt management +// Use Event Unit API (event_unit_utils.h) for event handling #define IRQ_REDMULE_EVT_0 (31) #define IRQ_REDMULE_EVT_1 (30) #define IRQ_A2O_ERROR (29) @@ -112,17 +123,32 @@ static inline void sentinel_end(){ } static inline void ccount_en(){ +#ifdef CV32E40X asm volatile("csrrci zero, 0x320, 0x1" ::); +#else + uint32_t pcmr = 1; + asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); +#endif } static inline void ccount_dis(){ +#ifdef CV32E40X asm volatile("csrrsi zero, 0x320, 0x1" ::); +#else + uint32_t pcmr = 0; + asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); +#endif } static inline uint32_t get_cyclel(){ uint32_t cyclel; +#ifdef CV32E40X asm volatile("csrr %0, cycle" :"=r"(cyclel):); +#else + asm volatile("csrr %0, 0x780" + :"=r"(cyclel):); 
+#endif return cyclel; } @@ -142,8 +168,13 @@ uint32_t get_cycle(){ static inline uint32_t get_timel(){ uint32_t timel; +#ifdef CV32E40X asm volatile("csrr %0, time" :"=r"(timel):); +#else /* NOTE(review): in this branch get_cyclel reads CSR 0x780 and get_timel reads CSR 0x781 - confirm 0x781 is the intended source for a time value on this core */ + asm volatile("csrr %0, 0x781" + :"=r"(timel):); +#endif return timel; } @@ -161,4 +192,66 @@ uint32_t get_time(){ return timel; } +// Additional Flex-V CSR access functions based on CSR table +static inline uint32_t get_mstatus(){ /* returns the value read from CSR 0x300 */ + uint32_t mstatus; + asm volatile("csrr %0, 0x300" :"=r"(mstatus):); // MSTATUS (0x300) + return mstatus; +} + +static inline void set_mstatus(uint32_t value){ /* writes value to CSR 0x300 */ + asm volatile("csrw 0x300, %0" ::"r"(value)); // MSTATUS (0x300) +} + +static inline uint32_t get_mtvec(){ /* returns the value read from CSR 0x305 */ + uint32_t mtvec; + asm volatile("csrr %0, 0x305" :"=r"(mtvec):); // MTVEC (0x305) + return mtvec; +} + +static inline void set_mtvec(uint32_t value){ /* writes value to CSR 0x305 */ + asm volatile("csrw 0x305, %0" ::"r"(value)); // MTVEC (0x305) +} + +static inline uint32_t get_mepc(){ /* returns the value read from CSR 0x341 */ + uint32_t mepc; + asm volatile("csrr %0, 0x341" :"=r"(mepc):); // MEPC (0x341) + return mepc; +} + +static inline void set_mepc(uint32_t value){ /* writes value to CSR 0x341 */ + asm volatile("csrw 0x341, %0" ::"r"(value)); // MEPC (0x341) +} + +static inline uint32_t get_mcause(){ /* returns the value read from CSR 0x342 */ + uint32_t mcause; + asm volatile("csrr %0, 0x342" :"=r"(mcause):); // MCAUSE (0x342) + return mcause; +} + +static inline uint32_t get_privlv(){ /* returns the value read from CSR 0xc10; NOTE(review): unlike get_cyclel/get_timel this accessor is not guarded by CV32E40X - confirm the CSR exists on every supported core */ + uint32_t privlv; + asm volatile("csrr %0, 0xc10" :"=r"(privlv):); // PRIVLV (0xC10) + return privlv; +} + +static inline uint32_t get_uhartid(){ /* returns the value read from CSR 0x014; NOTE(review): not guarded by CV32E40X - confirm the CSR exists on every supported core */ + uint32_t uhartid; + asm volatile("csrr %0, 0x014" :"=r"(uhartid):); // UHARTID (0x014) + return uhartid; +} + +// Flex-V performance counter control +static inline void perf_counter_enable(){ /* writes 3 to CSR 0x7e0, then 1 to CSR 0x7e1; NOTE(review): the non-CV32E40X branch of ccount_en() writes only 0x7e1 - confirm whether it depends on this function having run first */ + uint32_t pcer = 3; // Enable cycles (bit 0) and instruction count (bit 1) + uint32_t pcmr = 1; // Enable global performance counter + asm volatile("csrw 0x7e0, %0" ::"r"(pcer)); // PCER_MACHINE (0x7E0) + asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); // PCMR_MACHINE (0x7E1) +} + +static inline void 
perf_counter_disable(){ /* writes 0 to CSR 0x7e1; CSR 0x7e0 is left untouched */ + uint32_t pcmr = 0; // Disable global performance counter + asm volatile("csrw 0x7e1, %0" ::"r"(pcmr)); // PCMR_MACHINE (0x7E1) +} + #endif /*MAGIA_TILE_UTILS_H*/ diff --git a/sw/utils/magia_utils.h b/sw/utils/magia_utils.h index fc21b05..9443c9f 100644 --- a/sw/utils/magia_utils.h +++ b/sw/utils/magia_utils.h @@ -26,7 +26,7 @@ #define STR_OFFSET (0x00000000) #define STR_BASE (RESERVED_START + STR_OFFSET) -#define SYNC_OFFSET (0x0000F000) +#define SYNC_OFFSET (0x0000D800) /* SYNC_BASE is RESERVED_START + SYNC_OFFSET; NOTE(review): if RESERVED_START is the 0x00001700 value from magia_tile_utils.h, SYNC_BASE becomes 0x0000EF00 and STR_BASE becomes 0x00001700 - confirm against the hardware address map */ #define SYNC_BASE (RESERVED_START + SYNC_OFFSET) #define SYNC_EN (SYNC_BASE + 0x4) diff --git a/sw/utils/redmule_mm_utils.h b/sw/utils/redmule_mm_utils.h new file mode 100644 index 0000000..b39468c --- /dev/null +++ b/sw/utils/redmule_mm_utils.h @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2023-2024 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * SPDX-License-Identifier: Apache-2.0 + * + * + * RedMulE MM Utilities + * + * This header contains MM-related definitions and functions for RedMulE control + * using MMIO-based register access. 
+ */ + +#ifndef REDMULE_MM_UTILS_H +#define REDMULE_MM_UTILS_H + +#include <stdint.h> +#include "magia_tile_utils.h" + +/* OBI2HWPE Protocol Stamps */ +#define REDMULE_ADDR_BASE REDMULE_BASE + +#define HWPE_WRITE(value, offset) (*(volatile int *)(REDMULE_ADDR_BASE + (offset)) = (value)) /* arguments and expansion are parenthesized so compound expressions expand as intended */ +#define HWPE_READ(offset) (*(volatile int *)(REDMULE_ADDR_BASE + (offset))) /* arguments and expansion are parenthesized so compound expressions expand as intended */ + +/* Register offsets (RedMulE hwpe-ctrl) */ +#define REDMULE_REG_OFFS 0x00 +#define REDMULE_TRIGGER 0x00 +#define REDMULE_ACQUIRE 0x04 +#define REDMULE_EVT_ENABLE 0x08 +#define REDMULE_STATUS 0x0C +#define REDMULE_RUNNING_JOB 0x10 +#define REDMULE_SOFT_CLEAR 0x14 + +/* RedMulE configuration registers */ +#define REDMULE_REG_X_PTR 0x40 +#define REDMULE_REG_W_PTR 0x44 +#define REDMULE_REG_Z_PTR 0x48 +#define REDMULE_MCFG0_PTR 0x4C +#define REDMULE_MCFG1_PTR 0x50 +#define REDMULE_ARITH_PTR 0x54 + +/* Operations and formats */ +#define gemm_ops 0x1 +#define Float16 0x1 +#define Float16Alt 0x2 +#define Float8 0x3 +#define Float8Alt 0x4 + +/* HWPE Register Access Functions */ +static inline void redmule_x_add_set(unsigned int value) { /* writes value at offset REDMULE_REG_OFFS + REDMULE_REG_X_PTR */ + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_X_PTR); +} + +static inline void redmule_w_add_set(unsigned int value) { /* writes value at offset REDMULE_REG_OFFS + REDMULE_REG_W_PTR */ + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_W_PTR); +} + +static inline void redmule_z_add_set(unsigned int value) { /* writes value at offset REDMULE_REG_OFFS + REDMULE_REG_Z_PTR */ + HWPE_WRITE(value, REDMULE_REG_OFFS + REDMULE_REG_Z_PTR); +} + +static inline void redmule_mcfg_set(uint32_t mcfg0, uint32_t mcfg1) { /* writes mcfg0 to the MCFG0 offset, then mcfg1 to the MCFG1 offset */ + HWPE_WRITE(mcfg0, REDMULE_REG_OFFS + REDMULE_MCFG0_PTR); + HWPE_WRITE(mcfg1, REDMULE_REG_OFFS + REDMULE_MCFG1_PTR); +} + +static inline void redmule_arith_set(uint32_t arith) { /* writes arith at offset REDMULE_REG_OFFS + REDMULE_ARITH_PTR */ + HWPE_WRITE(arith, REDMULE_REG_OFFS + REDMULE_ARITH_PTR); +} + +static inline void hwpe_trigger_job() { /* writes 0 at the REDMULE_TRIGGER offset */ + HWPE_WRITE(0, REDMULE_TRIGGER); +} + +static inline int hwpe_acquire_job() { /* returns the value read at the REDMULE_ACQUIRE offset */ + int result = HWPE_READ(REDMULE_ACQUIRE); + return result; +} + +static inline unsigned int hwpe_get_status() { /* returns the value read at the REDMULE_STATUS offset */ + unsigned int result = HWPE_READ(REDMULE_STATUS); + 
return result; +} + +static inline void hwpe_soft_clear() { /* writes 0 at the REDMULE_SOFT_CLEAR offset */ + HWPE_WRITE(0, REDMULE_SOFT_CLEAR); +} + +static inline void hwpe_cg_enable() { /* no-op */ + return; +} + +static inline void hwpe_cg_disable() { /* no-op */ + return; +} + +static inline void hwpe_wait_for_completion() { /* polls hwpe_get_status() until it reads 0 or max_polls reads have been made; NOTE(review): the timeout path returns exactly like the idle path, so a caller cannot tell them apart */ + // Polling-based completion detection + unsigned int status; + unsigned int poll_count = 0; + unsigned int max_polls = 100000; + + do { + status = hwpe_get_status(); + poll_count++; + + // Small pause to not overload system + if (poll_count % 50 == 0) { + wait_nop(10); + } + + // Exit conditions: idle status (0) or timeout + if (status == 0 || poll_count >= max_polls) { + break; + } + + } while (1); +} + +/* RedMulE Configuration Function */ +static inline void redmule_cfg(unsigned int x, unsigned int w, unsigned int z, uint16_t m_size, uint16_t n_size, + uint16_t k_size, uint8_t gemm_op, uint8_t gemm_fmt) { /* packs k_size (bits 31:16) and m_size into the MCFG0 word, n_size into the MCFG1 word, gemm_op (<<10) and gemm_fmt (<<7) into the ARITH word, then writes the X/W/Z pointers and the three words; operands are widened to uint32_t before shifting so k_size >= 0x8000 cannot shift into the sign bit of a promoted int */ + + uint32_t mcfg_reg0 = ((uint32_t)k_size << 16) | ((uint32_t)m_size << 0); + uint32_t mcfg_reg1 = (uint32_t)n_size << 0; + uint32_t arith_reg = ((uint32_t)gemm_op << 10) | ((uint32_t)gemm_fmt << 7); + + redmule_x_add_set((unsigned int)x); + redmule_w_add_set((unsigned int)w); + redmule_z_add_set((unsigned int)z); + redmule_mcfg_set((unsigned int)mcfg_reg0, (unsigned int)mcfg_reg1); + redmule_arith_set((unsigned int)arith_reg); +} + +#endif /* REDMULE_MM_UTILS_H */ diff --git a/target/sim/src/mesh/magia_vip.sv b/target/sim/src/mesh/magia_vip.sv index b430d17..5d3ca30 100644 --- a/target/sim/src/mesh/magia_vip.sv +++ b/target/sim/src/mesh/magia_vip.sv @@ -257,8 +257,12 @@ module magia_vip time last_start_id = 0, last_end_id = 0, sync_time_id; int unsigned completed_syncs_id = 0; for (genvar i = 0; i < magia_tb_pkg::N_TILES_Y; i++) begin: gen_tile_instr_monitor_y - for (genvar j = 0; j < magia_tb_pkg::N_TILES_X; j++) begin: gen_tile_instr_monitor_x + for (genvar j = 0; j < magia_tb_pkg::N_TILES_X; j++) begin: gen_tile_instr_monitor_x +`ifdef CV32E40X assign curr_instr_ex[i*magia_tb_pkg::N_TILES_X+j] = 
i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40x_core.core_i.id_stage_i.id_ex_pipe_o.instr.bus_resp.rdata; +`else + assign curr_instr_ex[i*magia_tb_pkg::N_TILES_X+j] = i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40p_core.ex_valid ? i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40p_core.id_stage_i.instr_rdata_i : '0; /* NOTE(review): this EX monitor and the ID monitor below sample the same id_stage_i.instr_rdata_i signal, here gated by ex_valid - confirm it holds the instruction that is in EX when ex_valid is high */ +`endif always @(curr_instr_ex[i*magia_tb_pkg::N_TILES_X+j]) begin: instr_ex_reporter if (curr_instr_ex[i*magia_tb_pkg::N_TILES_X+j] == 32'h50500013) $display("[TB][mhartid %0d - Tile (%0d, %0d)] detected sentinel instruction in EX stage at time %0dns", i*magia_tb_pkg::N_TILES_X+j, i, j, time_var); @@ -274,7 +278,11 @@ module magia_vip completed_syncs_ex++; end end +`ifdef CV32E40X assign curr_instr_id[i*magia_tb_pkg::N_TILES_X+j] = i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40x_core.core_i.id_stage_i.if_id_pipe_i.instr.bus_resp.rdata; +`else + assign curr_instr_id[i*magia_tb_pkg::N_TILES_X+j] = i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40p_core.id_stage_i.instr_rdata_i; /* non-CV32E40X build: ID monitor taps id_stage_i.instr_rdata_i of i_cv32e40p_core, ungated */ +`endif always @(curr_instr_id[i*magia_tb_pkg::N_TILES_X+j]) begin: instr_id_reporter if (curr_instr_id[i*magia_tb_pkg::N_TILES_X+j] == 32'h40400013) $display("[TB][mhartid %0d - Tile (%0d, %0d)] detected sentinel instruction in ID stage at time %0dns", i*magia_tb_pkg::N_TILES_X+j, i, j, time_var); @@ -320,8 +328,13 @@ module magia_vip int unsigned sync_iteration = 0; for (genvar i = 0; i < magia_tb_pkg::N_TILES_Y; i++) begin: gen_tile_instr_monitor_y for (genvar j = 0; j < magia_tb_pkg::N_TILES_X; j++) begin: gen_tile_instr_monitor_x +`ifdef CV32E40X assign curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j] = i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40x_core.core_i.wb_stage_i.ex_wb_pipe_i.instr_valid ? 
i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40x_core.core_i.wb_stage_i.ex_wb_pipe_i.instr.bus_resp.rdata : '0; +`else + assign curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j] = i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40p_core.wb_valid ? + i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40p_core.regfile_wdata : '0; /* NOTE(review): the reporter below compares curr_instr_wb with 32'h5AA00013, which the CV32E40X branch sources from an instruction word (instr.bus_resp.rdata); regfile_wdata looks like register write data rather than an instruction - confirm the sentinel compare can ever match in this branch */ +`endif always @(curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j]) begin: instr_wb_reporter if (curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j] == 32'h5AA00013) begin start_sentinel[i*magia_tb_pkg::N_TILES_X+j].push_back($time); @@ -475,8 +488,13 @@ module magia_vip time sentinel_latency[magia_tb_pkg::N_TILES]; for (genvar i = 0; i < magia_tb_pkg::N_TILES_Y; i++) begin: gen_tile_instr_monitor_y for (genvar j = 0; j < magia_tb_pkg::N_TILES_X; j++) begin: gen_tile_instr_monitor_x +`ifdef CV32E40X assign curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j] = i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40x_core.core_i.wb_stage_i.ex_wb_pipe_i.instr_valid ? i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40x_core.core_i.wb_stage_i.ex_wb_pipe_i.instr.bus_resp.rdata : '0; +`else + assign curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j] = i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40p_core.wb_valid ? 
+ i_magia.gen_y_tile[i].gen_x_tile[j].i_magia_tile.i_cv32e40p_core.regfile_wdata : '0; /* NOTE(review): regfile_wdata is compared with the instruction encoding 32'h5AA00013 below - confirm it can carry an instruction word */ +`endif always @(curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j]) begin: instr_wb_reporter if (curr_instr_wb[i*magia_tb_pkg::N_TILES_X+j] == 32'h5AA00013) begin start_sentinel[i*magia_tb_pkg::N_TILES_X+j].push_back($time); diff --git a/target/sim/src/tile/magia_tile_fixture.sv b/target/sim/src/tile/magia_tile_fixture.sv index ec782c7..6a8810f 100644 --- a/target/sim/src/tile/magia_tile_fixture.sv +++ b/target/sim/src/tile/magia_tile_fixture.sv @@ -159,10 +159,13 @@ module magia_tile_fixture; .core_sleep_o ( core_sleep ), .wu_wfe_i ( wu_wfe ) ); - `ifdef CORE_TRACES - localparam string core_trace_file_name = "log_file_0"; - defparam i_magia_tile.i_cv32e40x_core.rvfi_i.tracer_i.LOGFILE_PATH_PLUSARG = core_trace_file_name; - `endif +`ifdef CORE_TRACES +`ifdef CV32E40X /* the trace-file defparam targets a path inside i_cv32e40x_core, so it is only emitted when CV32E40X is defined */ + localparam string core_trace_file_name = "log_file_0"; + defparam i_magia_tile.i_cv32e40x_core.rvfi_i.tracer_i.LOGFILE_PATH_PLUSARG = core_trace_file_name; +`endif + // Note: cv32e40p tracer generates its own filename: trace_core_{cluster_id}_{core_id}.log +`endif /*******************************************************/ /** DUT End **/ diff --git a/target/sim/src/tile/magia_tile_vip.sv b/target/sim/src/tile/magia_tile_vip.sv index d30e36c..b14f13c 100644 --- a/target/sim/src/tile/magia_tile_vip.sv +++ b/target/sim/src/tile/magia_tile_vip.sv @@ -245,8 +245,12 @@ end /*******************************************************/ `ifdef PROFILE_DETAILED - bit[31:0] curr_instr; + bit[31:0] curr_instr; +`ifdef CV32E40X assign curr_instr = i_magia_tile.i_cv32e40x_core.core_i.if_stage_i.if_id_pipe_o.instr.bus_resp.rdata; +`else + assign curr_instr = i_magia_tile.i_cv32e40p_core.id_stage_i.instr_rdata_i; /* non-CV32E40X build: monitor taps id_stage_i.instr_rdata_i of i_cv32e40p_core */ +`endif always @(curr_instr) begin: instr_reporter if (curr_instr == 32'h50500013) $display("[TB] detected sentinel instruction at time %0dns", time_var); end @@ -258,8 +262,13 @@ end time start_sentinel[$]; time end_sentinel[$]; 
time sentinel_latency; +`ifdef CV32E40X assign curr_instr_wb = i_magia_tile.i_cv32e40x_core.core_i.wb_stage_i.ex_wb_pipe_i.instr_valid ? i_magia_tile.i_cv32e40x_core.core_i.wb_stage_i.ex_wb_pipe_i.instr.bus_resp.rdata : '0; +`else + assign curr_instr_wb = i_magia_tile.i_cv32e40p_core.wb_valid ? + i_magia_tile.i_cv32e40p_core.regfile_wdata : '0; /* NOTE(review): the reporter below compares curr_instr_wb with 32'h5AA00013, which the CV32E40X branch sources from an instruction word; regfile_wdata looks like register write data - confirm the sentinel compare can ever match in this branch */ +`endif always @(curr_instr_wb) begin: instr_wb_reporter if (curr_instr_wb == 32'h5AA00013) begin start_sentinel.push_back($time); @@ -284,4 +293,4 @@ end /** Instruction Monitor End **/ /*******************************************************/ -endmodule: magia_tile_vip +endmodule: magia_tile_vip