diff --git a/.github/scripts/defaults.sh b/.github/scripts/defaults.sh index ea90bbaee7..dfecfa5881 100755 --- a/.github/scripts/defaults.sh +++ b/.github/scripts/defaults.sh @@ -27,7 +27,7 @@ REMOTE_COURSIER_CACHE=$REMOTE_WORK_DIR/.coursier-cache declare -A grouping grouping["group-cores"]="chipyard-cva6 chipyard-ibex chipyard-rocket chipyard-hetero chipyard-boomv3 chipyard-boomv4 chipyard-sodor chipyard-digitaltop chipyard-multiclock-rocket chipyard-nomem-scratchpad chipyard-spike chipyard-clone chipyard-prefetchers chipyard-shuttle" grouping["group-peripherals"]="chipyard-dmirocket chipyard-dmiboomv3 chipyard-dmiboomv4 chipyard-spiflashwrite chipyard-mmios chipyard-nocores chipyard-manyperipherals chipyard-chiplike chipyard-tethered chipyard-symmetric chipyard-llcchiplet" -grouping["group-accels"]="chipyard-compressacc chipyard-mempress chipyard-gemmini chipyard-manymmioaccels chipyard-nvdla chipyard-aes256ecb chipyard-rerocc chipyard-rocketvector chipyard-shuttlevector chipyard-shuttleara" +grouping["group-accels"]="chipyard-compressacc chipyard-mempress chipyard-gemmini chipyard-manymmioaccels chipyard-nvdla chipyard-aes256ecb chipyard-rerocc chipyard-rocketvector chipyard-shuttlevector chipyard-shuttleara chipyard-hlsacc" grouping["group-constellation"]="chipyard-constellation" grouping["group-tracegen"]="tracegen tracegen-boomv3 tracegen-boomv4" grouping["group-other"]="icenet testchipip constellation rocketchip-amba rocketchip-tlsimple rocketchip-tlwidth rocketchip-tlxbar chipyard-clusters" @@ -41,6 +41,7 @@ mapping["chipyard-rocket"]=" CONFIG=QuadChannelRocketConfig" mapping["chipyard-dmirocket"]=" CONFIG=dmiRocketConfig" mapping["chipyard-mempress"]=" CONFIG=MempressRocketConfig" mapping["chipyard-compressacc"]=" CONFIG=ZstdCompressorRocketConfig" +mapping["chipyard-hlsacc"]=" CONFIG=GCDHLSRocketConfig" mapping["chipyard-prefetchers"]=" CONFIG=PrefetchingRocketConfig" mapping["chipyard-digitaltop"]=" TOP=DigitalTop" mapping["chipyard-manymmioaccels"]=" 
CONFIG=ManyMMIOAcceleratorRocketConfig" diff --git a/docs/Customization/Incorporating-HLS.rst b/docs/Customization/Incorporating-HLS.rst new file mode 100644 index 0000000000..4351946899 --- /dev/null +++ b/docs/Customization/Incorporating-HLS.rst @@ -0,0 +1,83 @@ +.. _incorporating-hls: + +Incorporating HLS +============================ + +High Level Synthesis (HLS) is a method for iterating quickly on +different hardware algorithms that automatically generates an RTL +circuit to match a specification in a high level language like C. + +Here, we will integrate an HLS-generated accelerator that computes +the Greatest Common Divisor (GCD) of two integers. This tutorial +builds on the sections :ref:`mmio-accelerators` and +:ref:`incorporating-verilog-blocks`. + +Adding an HLS project +--------------------------------------- + +In this tutorial, we use Vitis HLS. The user guide for this tool +can be found at https://docs.amd.com/r/en-US/ug1399-vitis-hls. + +Our project consists of 3 HLS files: + +* C++ program of the GCD algorithm: :gh-file-ref:`generators/chipyard/src/main/resources/hls/HLSAccel.cpp` +* TCL script to run Vitis HLS: :gh-file-ref:`generators/chipyard/src/main/resources/hls/run_hls.tcl` +* Makefile to run HLS and move verilog files: :gh-file-ref:`generators/chipyard/src/main/resources/hls/Makefile` + +This example implements an iterative GCD algorithm, which is manually connected to +a TileLink register node in the ``HLSGCDAccel`` class in +:gh-file-ref:`generators/chipyard/src/main/scala/example/GCD.scala`. +HLS also supports adding AXI nodes to accelerators using compiler directives and +the HLS stream library. See the Vitis HLS user guide for AXI implementation information. + +The HLS code is synthesized for a particular FPGA target, in this case, +an AMD Alveo U200. The target FPGA part is specified in ``run_hls.tcl`` using +the ``set_part`` command. 
The clock period, used for design optimization purposes, +is also set in ``run_hls.tcl`` using the ``create_clock`` command. + +To generate the verilog files, as well as synthesis reports, run: + +.. code-block:: none + + vitis_hls run_hls.tcl + +The files can be found in a generated folder named proj\_\<project_name\>, +in our case, ``proj_gcd_example``. + +In our case, we include a ``Makefile`` to run HLS and to move files to +their intended locations. To generate the verilog files using the Makefile, run: + +.. code-block:: none + + make + +To delete the generated files, run: + +.. code-block:: none + + make clean + +Creating the Verilog black box +--------------------------------------- + +.. Note:: This section discusses automatically running HLS within a Verilog black box. Please consult :ref:`incorporating-verilog-blocks` for background information on writing a Verilog black box. + +We use Scala to run ``make``, which runs HLS and copies the files into :gh-file-ref:`generators/chipyard/src/main/resources/vsrc`. +Then, we add the path to each file. This code will execute during Chisel elaboration, conveniently handling +file generation for the user. + +.. literalinclude:: ../../generators/chipyard/src/main/scala/example/GCD.scala + :language: scala + :start-after: DOC include start: HLS blackbox + :end-before: DOC include end: HLS blackbox + +Running the example +--------------------------------------- + +To test if the accelerator works, use the test program in :gh-file-ref:`tests/gcd.c`. +Compile the program with ``make``. Then, run: + +.. code-block:: none + + cd sims/vcs + make run-binary CONFIG=GCDHLSRocketConfig BINARY=../../tests/gcd.riscv \ No newline at end of file diff --git a/docs/Customization/index.rst b/docs/Customization/index.rst index 9656efe654..b74135f7c7 100644 --- a/docs/Customization/index.rst +++ b/docs/Customization/index.rst @@ -46,6 +46,7 @@ We recommend reading all these pages in order. Hit next to get started! 
Keys-Traits-Configs DMA-Devices Incorporating-Verilog-Blocks + Incorporating-HLS Memory-Hierarchy Boot-Process IOBinders diff --git a/generators/chipyard/src/main/resources/hls/HLSAccel.cpp b/generators/chipyard/src/main/resources/hls/HLSAccel.cpp new file mode 100644 index 0000000000..41e58a4a5f --- /dev/null +++ b/generators/chipyard/src/main/resources/hls/HLSAccel.cpp @@ -0,0 +1,35 @@ +#ifndef _GCD_EX_H_ +#define _GCD_EX_H_ + +#include <ap_int.h> + +#define DATA_WIDTH 32 + +typedef ap_uint<DATA_WIDTH> io_t; + +// Top-level HLS function: returns gcd(x, y) computed by repeated subtraction. +io_t HLSGCDAccelBlackBox(io_t x, io_t y) { + io_t tmp; + io_t gcd; + + tmp = y; + gcd = x; + + // gcd(0, y) == y. Without this guard the loop below would subtract 0 from + // tmp on every iteration and never terminate when x is 0 and y is not. + if (gcd == 0) { + return tmp; + } + + while(tmp != 0) { + if (gcd > tmp) { + gcd = gcd - tmp; + } else { + tmp = tmp - gcd; + } + } + + return gcd; +} + +#endif \ No newline at end of file diff --git a/generators/chipyard/src/main/resources/hls/Makefile b/generators/chipyard/src/main/resources/hls/Makefile new file mode 100644 index 0000000000..01ba225b01 --- /dev/null +++ b/generators/chipyard/src/main/resources/hls/Makefile @@ -0,0 +1,21 @@ +base_dir=$(abspath ../../../..) +hls_dir=$(abspath .) +hls_vlog_gendir=$(hls_dir)/proj_gcd_example/solution1/syn/verilog +vsrc_dir=$(base_dir)/src/main/resources/vsrc + +.PHONY: default run-hls clean + +HLS_CMD = vitis_hls +TCL_SCRIPT = run_hls.tcl +ACCEL_C = HLSAccel.cpp + +default: run-hls + +run-hls: $(ACCEL_C) $(TCL_SCRIPT) + $(HLS_CMD) $(TCL_SCRIPT) + cp -r $(hls_vlog_gendir)/. 
$(vsrc_dir) + +clean: + rm -rf $(hls_dir)/proj_gcd_example + rm -f $(hls_dir)/vitis_hls.log + rm -f $(vsrc_dir)/HLSGCDAccelBlackBox* \ No newline at end of file diff --git a/generators/chipyard/src/main/resources/hls/run_hls.tcl b/generators/chipyard/src/main/resources/hls/run_hls.tcl new file mode 100644 index 0000000000..7b3dd9cd0b --- /dev/null +++ b/generators/chipyard/src/main/resources/hls/run_hls.tcl @@ -0,0 +1,11 @@ +open_project -reset proj_gcd_example +add_files HLSAccel.cpp +set_top HLSGCDAccelBlackBox +open_solution -reset "solution1" + +# Specify FPGA board and clock frequency +set_part {xcu200-fsgd2104-2-e} +create_clock -period 10 + +csynth_design +exit \ No newline at end of file diff --git a/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala b/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala index 30a180439f..3c8ca45a90 100644 --- a/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala +++ b/generators/chipyard/src/main/scala/config/MMIOAcceleratorConfigs.scala @@ -28,6 +28,11 @@ class GCDAXI4BlackBoxRocketConfig extends Config( new chipyard.config.AbstractConfig) // DOC include end: GCDAXI4BlackBoxRocketConfig +class GCDHLSRocketConfig extends Config( + new chipyard.example.WithGCD(useAXI4=false, useBlackBox=false, useHLS=true) ++ + new freechips.rocketchip.rocket.WithNHugeCores(1) ++ + new chipyard.config.AbstractConfig) + // DOC include start: InitZeroRocketConfig class InitZeroRocketConfig extends Config( new chipyard.example.WithInitZero(0x88000000L, 0x1000L) ++ // add InitZero diff --git a/generators/chipyard/src/main/scala/example/GCD.scala b/generators/chipyard/src/main/scala/example/GCD.scala index 7fdb171443..94c434b789 100644 --- a/generators/chipyard/src/main/scala/example/GCD.scala +++ b/generators/chipyard/src/main/scala/example/GCD.scala @@ -1,5 +1,7 @@ package chipyard.example +import sys.process._ + import chisel3._ import chisel3.util._ import 
chisel3.experimental.{IntParam, BaseModule} @@ -17,7 +19,8 @@ case class GCDParams( address: BigInt = 0x4000, width: Int = 32, useAXI4: Boolean = false, - useBlackBox: Boolean = true) + useBlackBox: Boolean = true, + useHLS: Boolean = false) // DOC include end: GCD params // DOC include start: GCD key @@ -37,6 +40,18 @@ class GCDIO(val w: Int) extends Bundle { val busy = Output(Bool()) } +class HLSGCDAccelIO(val w: Int) extends Bundle { + val ap_clk = Input(Clock()) + val ap_rst = Input(Reset()) + val ap_start = Input(Bool()) + val ap_done = Output(Bool()) + val ap_idle = Output(Bool()) + val ap_ready = Output(Bool()) + val x = Input(UInt(w.W)) + val y = Input(UInt(w.W)) + val ap_return = Output(UInt(w.W)) +} + class GCDTopIO extends Bundle { val gcd_busy = Output(Bool()) } @@ -88,6 +103,23 @@ class GCDMMIOChiselModule(val w: Int) extends Module { } // DOC include end: GCD chisel +// DOC include start: HLS blackbox +class HLSGCDAccelBlackBox(val w: Int) extends BlackBox with HasBlackBoxPath { + val io = IO(new HLSGCDAccelIO(w)) + + val chipyardDir = System.getProperty("user.dir") + val hlsDir = s"$chipyardDir/generators/chipyard" + + // Run HLS command + val make = s"make -C ${hlsDir}/src/main/resources/hls default" + require (make.! 
== 0, "Failed to run HLS") + + // Add each vlog file + addPath(s"$hlsDir/src/main/resources/vsrc/HLSGCDAccelBlackBox.v") + addPath(s"$hlsDir/src/main/resources/vsrc/HLSGCDAccelBlackBox_flow_control_loop_pipe.v") +} +// DOC include end: HLS blackbox + // DOC include start: GCD router class GCDTL(params: GCDParams, beatBytes: Int)(implicit p: Parameters) extends ClockSinkDomain(ClockSinkParameters())(p) { val device = new SimpleDevice("gcd", Seq("ucbbar,gcd")) @@ -190,6 +222,64 @@ class GCDAXI4(params: GCDParams, beatBytes: Int)(implicit p: Parameters) extends } // DOC include end: GCD router +class HLSGCDAccel(params: GCDParams, beatBytes: Int)(implicit p: Parameters) extends ClockSinkDomain(ClockSinkParameters())(p) { + val device = new SimpleDevice("hlsgcdaccel", Seq("ucbbar,hlsgcdaccel")) + val node = TLRegisterNode(Seq(AddressSet(params.address, 4096-1)), device, "reg/control", beatBytes=beatBytes) + + override lazy val module = new HLSGCDAccelImpl + class HLSGCDAccelImpl extends Impl with HasGCDTopIO { + val io = IO(new GCDTopIO) + withClockAndReset(clock, reset) { + val x = Reg(UInt(params.width.W)) + val y = Wire(new DecoupledIO(UInt(params.width.W))) + val y_reg = Reg(UInt(params.width.W)) + val gcd = Wire(new DecoupledIO(UInt(params.width.W))) + val gcd_reg = Reg(UInt(params.width.W)) + val status = Wire(UInt(2.W)) + + val impl = Module(new HLSGCDAccelBlackBox(params.width)) + + impl.io.ap_clk := clock + impl.io.ap_rst := reset + + val s_idle :: s_busy :: Nil = Enum(2) + val state = RegInit(s_idle) + val result_valid = RegInit(false.B) + when (state === s_idle && y.valid) { + state := s_busy + result_valid := false.B + y_reg := y.bits + } .elsewhen (state === s_busy && impl.io.ap_done) { + state := s_idle + result_valid := true.B + gcd_reg := impl.io.ap_return + } + + impl.io.ap_start := state === s_busy + + gcd.valid := result_valid + status := Cat(impl.io.ap_idle, result_valid) + + impl.io.x := x + impl.io.y := y_reg + y.ready := impl.io.ap_idle + 
gcd.bits := gcd_reg + + io.gcd_busy := !impl.io.ap_idle + + node.regmap( + 0x00 -> Seq( + RegField.r(2, status)), // a read-only register capturing current status + 0x04 -> Seq( + RegField.w(params.width, x)), // a plain, write-only register + 0x08 -> Seq( + RegField.w(params.width, y)), // write-only, y.valid is set on write + 0x0C -> Seq( + RegField.r(params.width, gcd))) // read-only, gcd.ready is set on read + } + } +} + // DOC include start: GCD lazy trait trait CanHavePeripheryGCD { this: BaseSubsystem => private val portName = "gcd" @@ -210,6 +300,11 @@ trait CanHavePeripheryGCD { this: BaseSubsystem => TLFragmenter(pbus.beatBytes, pbus.blockBytes, holdFirstDeny = true) := _ } gcd + } else if (params.useHLS) { + val gcd = LazyModule(new HLSGCDAccel(params, pbus.beatBytes)(p)) + gcd.clockNode := pbus.fixedClockNode + pbus.coupleTo(portName) { gcd.node := TLFragmenter(pbus.beatBytes, pbus.blockBytes) := _ } + gcd } else { val gcd = LazyModule(new GCDTL(params, pbus.beatBytes)(p)) gcd.clockNode := pbus.fixedClockNode @@ -229,7 +324,11 @@ trait CanHavePeripheryGCD { this: BaseSubsystem => // DOC include end: GCD lazy trait // DOC include start: GCD config fragment -class WithGCD(useAXI4: Boolean = false, useBlackBox: Boolean = false) extends Config((site, here, up) => { - case GCDKey => Some(GCDParams(useAXI4 = useAXI4, useBlackBox = useBlackBox)) +class WithGCD(useAXI4: Boolean = false, useBlackBox: Boolean = false, useHLS: Boolean = false) extends Config((site, here, up) => { + case GCDKey => { + // useHLS is mutually exclusive with useAXI4 and useBlackBox + require(!useHLS || (!useAXI4 && !useBlackBox), "useHLS cannot be combined with useAXI4 or useBlackBox") + Some(GCDParams(useAXI4 = useAXI4, useBlackBox = useBlackBox, useHLS = useHLS)) + } }) // DOC include end: GCD config fragment