From b17eadbe7e245526fe5f50733fca11f9e79da6d9 Mon Sep 17 00:00:00 2001 From: Lucas-Wye Date: Fri, 9 Aug 2024 08:21:21 +0000 Subject: [PATCH] [rtl] support zvk --- configgen/generated/blastoise.json | 3 +- configgen/generated/machamp.json | 3 +- configgen/generated/psyduck.json | 20 +- configgen/generated/sandslash.json | 3 +- configgen/src/Main.scala | 17 +- ipemu/src/TestBench.scala | 2 +- t1/src/Bundles.scala | 3 + t1/src/Lane.scala | 300 ++++++++- t1/src/LaneZvk.scala | 624 ++++++++++++++++++ t1/src/LaneZvk256.scala | 192 ++++++ t1/src/T1.scala | 69 +- t1/src/VectorFunctionUnit.scala | 6 +- t1/src/decoder/Decoder.scala | 40 +- t1/src/decoder/InstructionDocumentation.scala | 26 + t1/src/decoder/T1DecodePattern.scala | 3 + t1/src/decoder/attribute/isItype.scala | 9 +- t1/src/decoder/attribute/isVtype.scala | 18 +- t1/src/decoder/attribute/isZvk.scala | 57 ++ t1/src/decoder/attribute/isZvk128.scala | 55 ++ t1/src/decoder/attribute/isZvk256.scala | 38 ++ t1/src/decoder/attribute/zvkUop.scala | 136 ++++ t1/src/laneStage/LaneExecutionBridge.scala | 63 +- t1/src/laneStage/LaneStage1.scala | 310 ++++++++- t1/src/laneStage/LaneStage3.scala | 57 +- t1/src/laneStage/SlotTokenManager.scala | 68 +- t1/src/laneStage/ZvkCrossReadUnit.scala | 133 ++++ t1/src/vrf/VRF.scala | 40 +- t1rocket/src/T1RocketTile.scala | 6 +- 28 files changed, 2192 insertions(+), 109 deletions(-) create mode 100644 t1/src/LaneZvk.scala create mode 100644 t1/src/LaneZvk256.scala create mode 100644 t1/src/decoder/attribute/isZvk.scala create mode 100644 t1/src/decoder/attribute/isZvk128.scala create mode 100644 t1/src/decoder/attribute/isZvk256.scala create mode 100644 t1/src/decoder/attribute/zvkUop.scala create mode 100644 t1/src/laneStage/ZvkCrossReadUnit.scala diff --git a/configgen/generated/blastoise.json b/configgen/generated/blastoise.json index 290ef86c1..88c465075 100644 --- a/configgen/generated/blastoise.json +++ b/configgen/generated/blastoise.json @@ -167,7 +167,8 @@ ] ] ], - 
"zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" diff --git a/configgen/generated/machamp.json b/configgen/generated/machamp.json index ceeaf5e59..865f6c13a 100644 --- a/configgen/generated/machamp.json +++ b/configgen/generated/machamp.json @@ -151,7 +151,8 @@ ] ], "floatModuleParameters": [], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" diff --git a/configgen/generated/psyduck.json b/configgen/generated/psyduck.json index 04a2f3572..6abca9424 100644 --- a/configgen/generated/psyduck.json +++ b/configgen/generated/psyduck.json @@ -4,7 +4,8 @@ "dLen": 256, "extensions": [ "Zve32f", - "Zvbb" + "Zvbb", + "Zvk" ], "t1customInstructions": [], "vrfBankSize": 1, @@ -184,6 +185,23 @@ 3 ] ] + ], + "zvkModuleParameters": [ + [ + { + "parameter": { + "datapathWidth": 32, + "latency": 3 + }, + "generator": "org.chipsalliance.t1.rtl.LaneZvk" + }, + [ + 0, + 1, + 2, + 3 + ] + ] ] } }, diff --git a/configgen/generated/sandslash.json b/configgen/generated/sandslash.json index 688085fe1..25f76a682 100644 --- a/configgen/generated/sandslash.json +++ b/configgen/generated/sandslash.json @@ -151,7 +151,8 @@ ] ], "floatModuleParameters": [], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" diff --git a/configgen/src/Main.scala b/configgen/src/Main.scala index 1c4a468ba..5439a1bc1 100644 --- a/configgen/src/Main.scala +++ b/configgen/src/Main.scala @@ -102,14 +102,15 @@ object Main { ), floatModuleParameters = Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), - zvbbModuleParameters = Seq() + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) param } - // DLEN256 VLEN256; FP; VRF p0rw,p1rw bank1; LSU bank8 beatbyte 8; Zvbb + // DLEN256 
VLEN256; FP; VRF p0rw,p1rw bank1; LSU bank8 beatbyte 8; Zvbb; Zvk @main def psyduck( @arg(name = "target-file", short = 't') targetFile: os.Path, @arg(name = "emit", short = 'e', doc = "emit config") doEmit: Boolean = true @@ -119,7 +120,7 @@ object Main { val param = T1Parameter( vLen, dLen, - extensions = Seq("Zve32f", "Zvbb"), + extensions = Seq("Zve32f", "Zvbb", "Zvk"), t1customInstructions = Nil, vrfBankSize = 1, vrfRamType = RamType.p0rwp1rw, @@ -155,7 +156,9 @@ object Main { floatModuleParameters = Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), zvbbModuleParameters = - Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))) + Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))), + zvkModuleParameters = + Seq((SerializableModuleGenerator(classOf[LaneZvk], LaneZvkParam(128, 3)), Seq(0, 1, 2, 3)), (SerializableModuleGenerator(classOf[LaneZvk256], LaneZvkParam(256, 3)), Seq(0, 1, 2, 3))), ) ) if (doEmit) param.emit(targetFile) @@ -207,7 +210,8 @@ object Main { ) ), floatModuleParameters = Seq(), - zvbbModuleParameters = Seq() // TODO + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) @@ -259,7 +263,8 @@ object Main { ) ), floatModuleParameters = Seq(), - zvbbModuleParameters = Seq() // TODO + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) diff --git a/ipemu/src/TestBench.scala b/ipemu/src/TestBench.scala index 318377932..00d9f7ee1 100644 --- a/ipemu/src/TestBench.scala +++ b/ipemu/src/TestBench.scala @@ -260,7 +260,7 @@ class TestBench(generator: SerializableModuleGenerator[T1, T1Parameter]) laneProbes.flatMap(laneProbe => laneProbe.slots.map(slot => slot.writeTag === tag.U && slot.writeQueueEnq && slot.writeMask.orR) ) ++ laneProbes.flatMap(laneProbe => - laneProbe.crossWriteProbe.map(cp => cp.bits.writeTag === tag.U && cp.valid && 
cp.bits.writeMask.orR) + laneProbe.crossWriteProbe.map(cp => cp.bits.writeTag === tag.U && cp.valid && cp.bits.writeMask.orR) // TODO: zvkCrossWriteProbe ) ++ // vrf write from lsu lsuProbe.slots.map(slot => slot.dataInstruction === tag.U && slot.writeValid && slot.dataMask.orR) ++ diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index fd833f07e..d3acd5221 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -591,8 +591,11 @@ class ExecutionUnitRecord(parameter: LaneParameter)(isLastSlot: Boolean) extends val maskForFilter: UInt = UInt(4.W) // false -> lsb of cross read group val executeIndex: Bool = Bool() + val zvkExecuteIndex: Option[UInt] = Option.when(parameter.zvkEnable)(UInt(2.W)) val source: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = + Option.when(isLastSlot && parameter.zvkEnable)(UInt((parameter.datapathWidth * 4).W)) /** groupCounter need use to update `Lane.maskFormatResultForGroup` */ val groupCounter: UInt = UInt(parameter.groupNumberBits.W) diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index e8b74ad00..4b3109bb5 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -63,7 +63,9 @@ class LaneProbe(parameter: LaneParameter) extends Bundle { val instructionFinished: UInt = UInt(parameter.chainingSize.W) val instructionValid: UInt = UInt(parameter.chainingSize.W) - val crossWriteProbe: Vec[ValidIO[LaneWriteProbe]] = Vec(2, Valid(new LaneWriteProbe(parameter.instructionIndexBits))) + val crossWriteProbe: Vec[ValidIO[LaneWriteProbe]] = Vec(2, Valid(new LaneWriteProbe(parameter.instructionIndexBits))) + val zvkCrossWriteProbe: Option[Vec[ValidIO[LaneWriteProbe]]] = + Option.when(parameter.zvkEnable)(Vec(4, Valid(new LaneWriteProbe(parameter.instructionIndexBits)))) val vrfProbe: VRFProbe = new VRFProbe(parameter.vrfParam) } @@ -86,16 +88,18 @@ object LaneParameter { * queue will be 
used for latch data from ring, in case of additional latency from ring. TODO: cover the queue full. */ case class LaneParameter( - vLen: Int, - datapathWidth: Int, - laneNumber: Int, - chainingSize: Int, - crossLaneVRFWriteEscapeQueueSize: Int, - fpuEnable: Boolean, - portFactor: Int, - vrfRamType: RamType, - decoderParam: DecoderParam, - vfuInstantiateParameter: VFUInstantiateParameter) + vLen: Int, + datapathWidth: Int, + laneNumber: Int, + chainingSize: Int, + crossLaneVRFWriteEscapeQueueSize: Int, + crossLaneVRFWriteEscapeZvkQueueSize: Int, + fpuEnable: Boolean, + zvkEnable: Boolean, + portFactor: Int, + vrfRamType: RamType, + decoderParam: DecoderParam, + vfuInstantiateParameter: VFUInstantiateParameter) extends SerializableModuleParameter { /** 1 in MSB for instruction order. */ @@ -135,7 +139,7 @@ case class LaneParameter( * * for each number in table below, it represent a [[datapathWidth]] * {{{ - * lane0 | lane1 | ... | lane8 + * lane0 | lane1 | ... | lane7 * offset0 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 * offset1 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 * offset2 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 @@ -178,7 +182,7 @@ case class LaneParameter( val executionQueueSize: Int = 4 /** Parameter for [[VRF]] */ - def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, portFactor, vrfRamType) + def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, portFactor, zvkEnable, vrfRamType) } /** Instantiate [[Lane]] from [[T1]], @@ -211,14 +215,37 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** Cross lane VRF Read Interface. only used for `narrow` an `widen` TODO: benchmark the usecase for tuning the Ring * Bus width. find a real world case for using `narrow` and `widen` aggressively. 
*/ + // 0: 0.0 - 0.1 + // 1: 0.2 - 0.3 + // 2: 0.4 - 0.5 + // 3: 0.6 - 0.7 + // 4: 1.0 - 1.1 + // 5: 1.2 - 1.3 + // 6: 1.4 - 1.5 + // 7: 1.6 - 1.7 + + // 0: 0.0 - 0.1 - 0.2 - 0.3 + // 1: 0.4 - 0.5 - 0.6 - 0.7 + // 2: 1.0 - 1.1 - 1.2 - 1.3 + // 3: 1.4 - 1.5 - 1.6 - 1.7 + // 4: 2.0 - 2.1 - 2.2 - 2.3 + // 5: 2.4 - 2.5 - 2.6 - 2.7 + // 6: 3.0 - 3.1 - 3.2 - 3.3 + // 7: 3.4 - 3.5 - 3.6 - 3.7 @public - val readBusPort: Vec[RingPort[ReadBusData]] = IO(Vec(2, new RingPort(new ReadBusData(parameter)))) + val readBusPort: Vec[RingPort[ReadBusData]] = IO(Vec(2, new RingPort(new ReadBusData(parameter)))) + @public + val zvkReadBusPort: Option[Vec[RingPort[ReadBusData]]] = + Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new ReadBusData(parameter))))) /** VRF Write Interface. only used for `narrow` an `widen` TODO: benchmark the usecase for tuning the Ring Bus width. * find a real world case for using `narrow` and `widen` aggressively. */ @public - val writeBusPort: Vec[RingPort[WriteBusData]] = IO(Vec(2, new RingPort(new WriteBusData(parameter)))) + val writeBusPort: Vec[RingPort[WriteBusData]] = IO(Vec(2, new RingPort(new WriteBusData(parameter)))) + @public + val zvkWriteBusPort: Option[Vec[RingPort[WriteBusData]]] = + Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new WriteBusData(parameter))))) /** request from [[T1.decode]] to [[Lane]]. 
*/ @public @@ -299,7 +326,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val loadDataInLSUWriteQueue: UInt = IO(Input(UInt(parameter.chainingSize.W))) - /** How many dataPath will writ by instruction in this lane */ + /** How many dataPath will write by instruction in this lane */ @public val writeCount: UInt = IO(Input(UInt((parameter.vlMaxBits - log2Ceil(parameter.laneNumber) - log2Ceil(parameter.dataPathByteWidth)).W))) @@ -318,6 +345,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // TODO: remove dontTouch(writeBusPort) + if (parameter.zvkEnable) { + dontTouch(zvkWriteBusPort.get) + } /** VRF instantces. */ val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam)) @@ -372,12 +402,24 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrfWriteArbiter(parameter.chainingSize).bits := topWriteQueue.bits topWriteQueue.ready := vrfWriteArbiter(parameter.chainingSize).ready - val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 3) { i => + val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 3) { i => RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) } - val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 3) { _ => RegInit(false.B) } - val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 3, Bool())) - val afterCheckDequeueFire: Seq[Bool] = afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r } + val zvkAllVrfWriteAfterCheck: Option[Seq[VRFWriteRequest]] = + Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize + 7) { i => + RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) + }) + val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 3) { _ => RegInit(false.B) } + val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 3, Bool())) + val afterCheckDequeueFire: Seq[Bool] = 
afterCheckValid.zip(afterCheckDequeueReady).map { case (v, r) => v && r } + + val zvkAfterCheckValid: Option[Seq[Bool]] = + Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize + 7) { _ => RegInit(false.B) }) + val zvkAfterCheckDequeueReady: Option[Vec[Bool]] = + Option.when(parameter.zvkEnable)(Wire(Vec(parameter.chainingSize + 7, Bool()))) + val zvkAfterCheckDequeueFire: Option[Seq[Bool]] = Option.when(parameter.zvkEnable)( + zvkAfterCheckValid.get.zip(zvkAfterCheckDequeueReady.get).map { case (v, r) => v && r } + ) /** for each slot, assert when it is asking [[T1]] to change mask */ val slotMaskRequestVec: Vec[ValidIO[UInt]] = Wire( @@ -431,14 +473,24 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) // 3 * slot + 2 cross read - val readCheckRequestVec: Vec[VRFReadRequest] = Wire( + val readCheckRequestVec: Vec[VRFReadRequest] = Wire( Vec( parameter.chainingSize * 3 + 2, new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) ) ) + val zvkReadCheckRequestVec: Option[Vec[VRFReadRequest]] = Option.when(parameter.zvkEnable)( + Wire( + Vec( + parameter.chainingSize * 3 + 4, + new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + ) + ) + ) - val readCheckResult: Vec[Bool] = Wire(Vec(parameter.chainingSize * 3 + 2, Bool())) + val readCheckResult: Vec[Bool] = Wire(Vec(parameter.chainingSize * 3 + 2, Bool())) + val zvkReadCheckResult: Option[Vec[Bool]] = + Option.when(parameter.zvkEnable)(Wire(Vec(parameter.chainingSize * 3 + 4, Bool()))) /** signal used for prohibiting slots to access VRF. a slot will become inactive when: * 1. cross lane read/write is not finished 2. 
lanes doesn't win mask request @@ -457,7 +509,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val slotCanShift: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) /** Which data group is waiting for the result of the cross-lane read */ - val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) + val readBusDequeueGroup: UInt = Wire( + UInt(parameter.groupNumberBits.W) + ) // TODO: readBusDequeueGroup is currently unused /** enqueue valid for execution unit */ val executeEnqueueValid: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) @@ -503,7 +557,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** queue for cross lane writing. TODO: benchmark the size of the queue */ - val crossLaneWriteQueue: Seq[Queue[VRFWriteRequest]] = Seq.tabulate(2)(i => + val crossLaneWriteQueue: Seq[Queue[VRFWriteRequest]] = Seq.tabulate(2)(i => Module( new Queue( new VRFWriteRequest( @@ -517,8 +571,24 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ ) ) ) - val maskedWriteUnit: Instance[MaskedWrite] = Instantiate(new MaskedWrite(parameter)) - val tokenManager: Instance[SlotTokenManager] = Instantiate(new SlotTokenManager(parameter)) + val zvkCrossLaneWriteQueue: Option[Seq[Queue[VRFWriteRequest]]] = Option.when(parameter.zvkEnable)( + Seq.tabulate(4)(i => + Module( + new Queue( + new VRFWriteRequest( + parameter.vrfParam.regNumBits, + parameter.vrfOffsetBits, + parameter.instructionIndexBits, + parameter.datapathWidth + ), + parameter.crossLaneVRFWriteEscapeZvkQueueSize, + pipe = true + ) + ) + ) + ) + val maskedWriteUnit: Instance[MaskedWrite] = Instantiate(new MaskedWrite(parameter)) + val tokenManager: Instance[SlotTokenManager] = Instantiate(new SlotTokenManager(parameter)) // TODO: do we need to expose the slot to a module? 
class Slot(val record: InstructionControlRecord, val index: Int) { val decodeResult: DecodeBundle = record.laneRequest.decodeResult @@ -663,9 +733,19 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ readCheckRequestVec((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.vrfCheckRequest(portIndex) stage1.checkResult(portIndex) := readCheckResult((parameter.chainingSize - index - 1) * 3 + portIndex) } + val zvkCheckSize = if (isLastSlot && parameter.zvkEnable) 7 else 3 + if (parameter.zvkEnable) { + Seq.tabulate(zvkCheckSize) { portIndex => + zvkReadCheckRequestVec.get((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.zvkVrfCheckRequest + .get(portIndex) + stage1.zvkCheckResult.get(portIndex) := zvkReadCheckResult.get( + (parameter.chainingSize - index - 1) * 3 + portIndex + ) + } + } // connect cross read bus if (isLastSlot) { - val tokenSize = parameter.crossLaneVRFWriteEscapeQueueSize + val tokenSize = parameter.crossLaneVRFWriteEscapeQueueSize readBusPort.zipWithIndex.foreach { case (readPort, portIndex) => // tx val tokenReg = RegInit(0.U(log2Ceil(tokenSize + 1).W)) @@ -687,6 +767,30 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // dequeue to cross read unit stage1.readBusDequeue.get(portIndex) <> queue.io.deq } + val zvKTokenSize = parameter.crossLaneVRFWriteEscapeZvkQueueSize + if (parameter.zvkEnable) { + zvkReadBusPort.get.zipWithIndex.foreach { case (readPort, portIndex) => + // tx + val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) + val tokenReady: Bool = tokenReg =/= zvKTokenSize.U + stage1.zvkReadBusRequest.get(portIndex).ready := tokenReady + readPort.deq.valid := stage1.zvkReadBusRequest.get(portIndex).valid && tokenReady + readPort.deq.bits := stage1.zvkReadBusRequest.get(portIndex).bits + val tokenUpdate = Mux(readPort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) + when(readPort.deq.valid ^ readPort.deqRelease) { + tokenReg := tokenReg + 
tokenUpdate + } + // rx + // rx queue + val queue = Module(new Queue(chiselTypeOf(readPort.deq.bits), zvKTokenSize, pipe = true)) + queue.io.enq.valid := readPort.enq.valid + queue.io.enq.bits := readPort.enq.bits + readPort.enqRelease := queue.io.deq.fire + assert(queue.io.enq.ready || !readPort.enq.valid) + // dequeue to cross read unit + stage1.zvkReadBusDequeue.get(portIndex) <> queue.io.deq + } + } // cross write writeBusPort.zipWithIndex.foreach { case (writePort, portIndex) => @@ -702,6 +806,21 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenReg := tokenReg + tokenUpdate } } + if (parameter.zvkEnable) { + zvkWriteBusPort.get.zipWithIndex.foreach { case (writePort, portIndex) => + val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) // credit limit follows the depth of the receiving zvkCrossLaneWriteQueue + val tokenReady: Bool = tokenReg =/= zvKTokenSize.U + writePort.deq.valid := stage3.zvkCrossWritePort.get(portIndex).valid && tokenReady + writePort.deq.bits := stage3.zvkCrossWritePort.get(portIndex).bits + stage3.zvkCrossWritePort.get(portIndex).ready := tokenReady + + // update token + val tokenUpdate = Mux(writePort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) + when(writePort.deq.valid ^ writePort.deqRelease) { + tokenReg := tokenReg + tokenUpdate + } + } + } } stage2.enqueue.valid := stage1.dequeue.valid && executionUnit.enqueue.ready @@ -780,6 +899,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ stage3.enqueue.bits.pipeData := stage2.dequeue.bits.pipeData.getOrElse(DontCare) stage3.enqueue.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3.enqueue.bits.crossWriteData := data) + if (parameter.zvkEnable) { + executionUnit.dequeue.bits.zvkCrossWriteData.foreach(data => stage3.enqueue.bits.zvkCrossWriteData.get := data) + } stage2.dequeue.bits.sSendResponse.foreach(_ => stage3.enqueue.bits.sSendResponse := _) executionUnit.dequeue.bits.ffoSuccess.foreach(_ => 
stage3.enqueue.bits.ffoSuccess := _) @@ -828,6 +950,24 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ assert(queue.io.enq.ready || !port.enq.valid) port.enqRelease := queue.io.deq.fire } + if (parameter.zvkEnable) { + zvkCrossLaneWriteQueue.get.zipWithIndex.foreach { case (queue, index) => + val port = zvkWriteBusPort.get(index) + // ((counter << 1) >> parameter.vrfParam.vrfOffsetBits).low(3) + val registerIncreaseBase = parameter.vrfParam.vrfOffsetBits - 1 + queue.io.enq.valid := port.enq.valid + queue.io.enq.bits.vd := + // 3: 8 reg => log(2, 8) + slotControl.head.laneRequest.vd + port.enq.bits.counter(registerIncreaseBase + 3 - 1, registerIncreaseBase) + queue.io.enq.bits.offset := port.enq.bits.counter ## index.U(2.W)(0) + queue.io.enq.bits.data := port.enq.bits.data + queue.io.enq.bits.last := DontCare + queue.io.enq.bits.instructionIndex := port.enq.bits.instructionIndex + queue.io.enq.bits.mask := FillInterleaved(2, port.enq.bits.mask) + assert(queue.io.enq.ready || !port.enq.valid) + port.enqRelease := queue.io.deq.fire + } + } val vfus: Seq[Instance[VFUModule]] = instantiateVFU(parameter.vfuInstantiateParameter)( requestVec, @@ -876,7 +1016,13 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } // all vrf write - val allVrfWrite: Seq[DecoupledIO[VRFWriteRequest]] = vrfWriteArbiter ++ crossLaneWriteQueue.map(_.io.deq) + val allVrfWrite: Seq[DecoupledIO[VRFWriteRequest]] = vrfWriteArbiter ++ crossLaneWriteQueue.map(_.io.deq) ++ { + if (parameter.zvkEnable) { + zvkCrossLaneWriteQueue.get.map(_.io.deq) + } else { + Seq() + } + } // check all write vrf.writeCheck.zip(allVrfWrite).foreach { case (check, write) => check.vd := write.bits.vd @@ -886,6 +1032,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.readCheck.zip(readCheckRequestVec).foreach { case (sink, source) => sink := source } readCheckResult.zip(vrf.readCheckResult).foreach { case (sink, 
source) => sink := source } + if (parameter.zvkEnable) { + vrf.zvkReadCheck.get.zip(zvkReadCheckRequestVec.get).foreach { case (sink, source) => sink := source } + zvkReadCheckResult.get.zip(vrf.zvkReadCheckResult.get).foreach { case (sink, source) => sink := source } + } allVrfWriteAfterCheck.zipWithIndex.foreach { case (req, i) => val check = vrf.writeAllow(i) @@ -900,16 +1050,59 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ afterCheckValid(i) := enqFire } } + if (parameter.zvkEnable) { + zvkAllVrfWriteAfterCheck.get.zipWithIndex.foreach { case (req, i) => + val check = { + if (i < 7) { + vrf.writeAllow(i) + } else { + false.B + } + } + val enqReady = check && (!zvkAfterCheckValid.get(i) || zvkAfterCheckDequeueReady.get(i)) + val enqFire = enqReady && allVrfWrite(i).valid + allVrfWrite(i).ready := enqReady + when(enqFire) { + req := allVrfWrite(i).bits + } + val deqFire = zvkAfterCheckDequeueFire.get(i) + when(deqFire ^ enqFire) { + zvkAfterCheckValid.get(i) := enqFire + } + } + } // Arbiter writeSelect := ffo(VecInit(afterCheckValid).asUInt & (~writeCavitation).asUInt) afterCheckDequeueReady.zipWithIndex.foreach { case (p, i) => p := (writeSelect(i) && queueBeforeMaskWrite.io.enq.ready) || writeCavitation(i) } + if (parameter.zvkEnable) { + zvkAfterCheckDequeueReady.get.zipWithIndex.foreach { case (p, i) => + p := { + if (i < 6) { + (writeSelect(i) && queueBeforeMaskWrite.io.enq.ready) || writeCavitation(i) + } else { + (queueBeforeMaskWrite.io.enq.ready) + } + } + } + } maskedWriteUnit.enqueue <> queueBeforeMaskWrite.io.deq queueBeforeMaskWrite.io.enq.valid := writeSelect.orR - queueBeforeMaskWrite.io.enq.bits := Mux1H(writeSelect, allVrfWriteAfterCheck) + + queueBeforeMaskWrite.io.enq.bits := { + // if(parameter.zvkEnable) { + // Mux( + // laneRequest.bits.decodeResult(Decoder.zvk), + // Mux1H(writeSelect, zvkAllVrfWriteAfterCheck.get), + // Mux1H(writeSelect, allVrfWriteAfterCheck) + // ) + // } else { + 
Mux1H(writeSelect, allVrfWriteAfterCheck) + // } + } // TODO vrf.write <> maskedWriteUnit.dequeue readBeforeMaskedWrite <> maskedWriteUnit.vrfReadRequest @@ -1167,6 +1360,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 1 + rptIndex) rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex } + if (parameter.zvkEnable) { + tokenManager.zvkCrossWriteReports.get.zipWithIndex.foreach { case (rpt, rptIndex) => + rpt.valid := zvkAfterCheckDequeueFire.get(parameter.chainingSize + 1 + rptIndex) + rpt.bits := zvkAllVrfWriteAfterCheck.get(parameter.chainingSize + 1 + rptIndex).instructionIndex + } + } // todo: add mask unit write token tokenManager.responseReport.valid := laneResponse.valid tokenManager.responseReport.bits := laneResponse.bits.instructionIndex @@ -1187,8 +1386,21 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // slot write tokenManager.slotWriteReport.zipWithIndex.foreach { case (rpt, rptIndex) => // All masks are also removed here - rpt.valid := afterCheckDequeueFire(rptIndex) - rpt.bits := allVrfWriteAfterCheck(rptIndex).instructionIndex + if (parameter.zvkEnable) { + rpt.valid := Mux( + laneRequest.bits.decodeResult(Decoder.zvk), + zvkAfterCheckDequeueFire.get(rptIndex), + afterCheckDequeueFire(rptIndex) + ) + rpt.bits := Mux( + laneRequest.bits.decodeResult(Decoder.zvk), + zvkAllVrfWriteAfterCheck.get(rptIndex).instructionIndex, + allVrfWriteAfterCheck(rptIndex).instructionIndex + ) + } else { + rpt.valid := afterCheckDequeueFire(rptIndex) + rpt.bits := allVrfWriteAfterCheck(rptIndex).instructionIndex + } } tokenManager.writePipeEnqReport.valid := queueBeforeMaskWrite.io.enq.fire @@ -1201,8 +1413,22 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenManager.topWriteEnq.valid := vrfWriteChannel.fire tokenManager.topWriteEnq.bits := 
vrfWriteChannel.bits.instructionIndex - tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) - tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + tokenManager.topWriteDeq.valid := { + if (parameter.zvkEnable) { + zvkAfterCheckDequeueFire.get(parameter.chainingSize) + } else { + afterCheckDequeueFire(parameter.chainingSize) + } + } + if (parameter.zvkEnable) { + tokenManager.topWriteDeq.bits := Mux( + laneRequest.bits.decodeResult(Decoder.zvk), + zvkAllVrfWriteAfterCheck.get(parameter.chainingSize).instructionIndex, + allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + ) + } else { + tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + } layer.block(layers.Verification) { val probeWire = Wire(new LaneProbe(parameter)) @@ -1241,7 +1467,13 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ pb.bits.writeTag := port.deq.bits.instructionIndex pb.bits.writeMask := port.deq.bits.mask } + if (parameter.zvkEnable) { + probeWire.zvkCrossWriteProbe.get.zip(zvkWriteBusPort.get).foreach { case (pb, port) => + pb.valid := port.deq.valid + pb.bits.writeTag := port.deq.bits.instructionIndex + pb.bits.writeMask := port.deq.bits.mask + } + } probeWire.vrfProbe := probe.read(vrf.vrfProbe) } - } diff --git a/t1/src/LaneZvk.scala b/t1/src/LaneZvk.scala new file mode 100644 index 000000000..1e05e09a7 --- /dev/null +++ b/t1/src/LaneZvk.scala @@ -0,0 +1,624 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3.experimental.hierarchy.instantiable +import chisel3._ +import chisel3.experimental.{SerializableModule, SerializableModuleParameter} +import chisel3.util._ +import org.chipsalliance.t1.rtl.decoder.{BoolField, Decoder} + +object LaneZvkParam { + implicit def rw: upickle.default.ReadWriter[LaneZvkParam] = upickle.default.macroRW +} + 
+case class LaneZvkParam(datapathWidth: Int, latency: Int) extends VFUParameter with SerializableModuleParameter { + val inputBundle = new LaneZvkRequest(datapathWidth) // TODO: make `datapathWidth` as 128 bits + val decodeField: BoolField = Decoder.zvk128 + val outputBundle = new LaneZvkResponse(datapathWidth) + override val NeedSplit: Boolean = false +} + +class LaneZvkRequest(datapathWidth: Int) extends VFUPipeBundle { + val src = Vec(3, UInt(datapathWidth.W)) + val opcode = UInt(4.W) + val vSew = UInt(2.W) + // val shifterSize = UInt(log2Ceil(datapathWidth).W) +} + +class LaneZvkResponse(datapathWidth: Int) extends VFUPipeBundle { + val data = UInt(datapathWidth.W) +} + +@instantiable +class LaneZvk(val parameter: LaneZvkParam) extends VFUModule(parameter) with SerializableModule[LaneZvkParam] { + val response: LaneZvkResponse = Wire(new LaneZvkResponse(parameter.datapathWidth)) + val request: LaneZvkRequest = connectIO(response).asTypeOf(parameter.inputBundle) + + val vs1: UInt = request.src(0) // vs1 / rs1 / uimm + val vs2: UInt = request.src(1) + val vd: UInt = request.src(2) + val vSew: UInt = UIntToOH(request.vSew) // sew = 0, 1, 2 + + private def UInt2BRev8(x: UInt): UInt = VecInit( + x.asBools.grouped(8).map(s => VecInit(s.reverse)).toSeq + ).asUInt // byte's bit reverse + + private def aes_get_column(x: UInt, y: UInt): UInt = { + Mux1H( + UIntToOH(y), + Seq( + x(31, 0), + x(63, 32), + x(95, 64), + x(127, 96) + ) + ) + } + + private def aes_shift_rows_inv(x: UInt): UInt = { + val ic3 = aes_get_column(x, 3.U) + val ic2 = aes_get_column(x, 2.U) + val ic1 = aes_get_column(x, 1.U) + val ic0 = aes_get_column(x, 0.U) + val oc0 = ic1(31, 24) ## ic2(23, 16) ## ic3(15, 8) ## ic0(7, 0) + val oc1 = ic2(31, 24) ## ic3(23, 16) ## ic0(15, 8) ## ic1(7, 0) + val oc2 = ic3(31, 24) ## ic0(23, 16) ## ic1(15, 8) ## ic2(7, 0) + val oc3 = ic0(31, 24) ## ic1(23, 16) ## ic2(15, 8) ## ic3(7, 0) + oc3 ## oc2 ## oc1 ## oc0 + } + + val aes_sbox_inv_table: Vec[UInt] = { + val tmp = 
Seq( + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, + 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, + 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, + 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, + 0x5d, 0x65, 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, + 0x84, 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, + 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41, 0x4f, + 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, + 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, + 0x5a, 0xf4, 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, 0x60, + 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d, + 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, + 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + ) + VecInit(tmp.map { s => s.asUInt(8.W) }) + } + private def inv_sbox_lookup(x: UInt): UInt = { + aes_sbox_inv_table(x) // table entry i holds InvSBox(i), so index by the byte directly + } + + val aes_sbox_fwd_table: Vec[UInt] = { + val tmp = Seq( + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, + 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, + 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, + 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, + 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, + 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, + 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, + 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, + 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, + 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, + 0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, + 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, + 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, + 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + ) + VecInit(tmp.map { s => s.asUInt(8.W) }) + } + + private def sbox_lookup(x: UInt): UInt = { + Mux(x === 0.U, aes_sbox_fwd_table(0.U), aes_sbox_fwd_table(x - 1.U)) + } + + private def aes_subword_inv(x: UInt): UInt = { + inv_sbox_lookup(x(31, 24)) ## + inv_sbox_lookup(x(23, 16)) ## + inv_sbox_lookup(x(15, 8)) ## + inv_sbox_lookup(x(7, 0)) + } + + private def aes_subbytes_inv(x: UInt): UInt = { + val ic0 = aes_get_column(x, 0.U) + val ic1 = aes_get_column(x, 1.U) + val ic2 = aes_get_column(x, 2.U) + val ic3 = aes_get_column(x, 3.U) + + val oc0 = aes_subword_inv(ic0) + val oc1 = aes_subword_inv(ic1) + val oc2 = 
aes_subword_inv(ic2) + val oc3 = aes_subword_inv(ic3) + + oc3 ## oc2 ## oc1 ## oc0 + } + + private def xt2(x: UInt): UInt = { + (x << 1) ^ Mux(x(7) === 1.U, "h1b".U, 0.U) + } + + private def xt3(x: UInt): UInt = { + x ^ xt2(x) + } + + private def gfmul(x: UInt, y: UInt): UInt = { + Mux(y(0) === 1.U, x, 0.U) ^ + Mux(y(1) === 1.U, xt2(x), 0.U) ^ + Mux(y(2) === 1.U, xt2(xt2(x)), 0.U) ^ + Mux(y(3) === 1.U, xt2(xt2(xt2(x))), 0.U) + } + + private def aes_mixcolumn_inv(x: UInt): UInt = { + val s0 = x(7, 0) + val s1 = x(15, 8) + val s2 = x(23, 16) + val s3 = x(31, 24) + + val b0 = gfmul(s0, "hE".U) ^ gfmul(s1, "hB".U) ^ gfmul(s2, "hD".U) ^ gfmul(s3, "h9".U) + val b1 = gfmul(s0, "h9".U) ^ gfmul(s1, "hE".U) ^ gfmul(s2, "hB".U) ^ gfmul(s3, "hD".U) + val b2 = gfmul(s0, "hD".U) ^ gfmul(s1, "h9".U) ^ gfmul(s2, "hE".U) ^ gfmul(s3, "hB".U) + val b3 = gfmul(s0, "hB".U) ^ gfmul(s1, "hD".U) ^ gfmul(s2, "h9".U) ^ gfmul(s3, "hE".U) + + b3 ## b2 ## b1 ## b0 + } + + private def aes_mixcolumns_inv(x: UInt): UInt = { + val ic0 = aes_get_column(x, 0.U) + val ic1 = aes_get_column(x, 1.U) + val ic2 = aes_get_column(x, 2.U) + val ic3 = aes_get_column(x, 3.U) + + val oc0 = aes_mixcolumn_inv(ic0) + val oc1 = aes_mixcolumn_inv(ic1) + val oc2 = aes_mixcolumn_inv(ic2) + val oc3 = aes_mixcolumn_inv(ic3) + + oc3 ## oc2 ## oc1 ## oc0 + } + + private def aes_subword_fwd(x: UInt): UInt = { + sbox_lookup(x(31, 24)) ## + sbox_lookup(x(23, 16)) ## + sbox_lookup(x(15, 8)) ## + sbox_lookup(x(7, 0)) + } + + private def aes_rotword(x: UInt): UInt = { + val a0 = x(7, 0) + val a1 = x(15, 8) + val a2 = x(23, 16) + val a3 = x(31, 24) + + a0 ## a3 ## a2 ## a1 + } + private def aes_subbytes_fwd(x: UInt): UInt = { + val ic0 = aes_get_column(x, 0.U) + val ic1 = aes_get_column(x, 1.U) + val ic2 = aes_get_column(x, 2.U) + val ic3 = aes_get_column(x, 3.U) + + val oc0 = aes_subword_fwd(ic0) + val oc1 = aes_subword_fwd(ic1) + val oc2 = aes_subword_fwd(ic2) + val oc3 = aes_subword_fwd(ic3) + + oc3 ## oc2 ## oc1 ## oc0 + } + 
+ private def aes_shift_rows_fwd(x: UInt): UInt = { + val ic0 = aes_get_column(x, 0.U) + val ic1 = aes_get_column(x, 1.U) + val ic2 = aes_get_column(x, 2.U) + val ic3 = aes_get_column(x, 3.U) + + val oc0 = ic3(31, 24) ## ic2(23, 16) ## ic1(15, 8) ## ic0(7, 0); + val oc1 = ic0(31, 24) ## ic3(23, 16) ## ic2(15, 8) ## ic1(7, 0); + val oc2 = ic1(31, 24) ## ic0(23, 16) ## ic3(15, 8) ## ic2(7, 0); + val oc3 = ic2(31, 24) ## ic1(23, 16) ## ic0(15, 8) ## ic3(7, 0); + + oc3 ## oc2 ## oc1 ## oc0 + } + + private def aes_mixcolumn_fwd(x: UInt): UInt = { + val s0 = x(7, 0) + val s1 = x(15, 8) + val s2 = x(23, 16) + val s3 = x(31, 24) + val b0 = xt2(s0) ^ xt3(s1) ^ (s2) ^ (s3) + val b1 = (s0) ^ xt2(s1) ^ xt3(s2) ^ (s3) + val b2 = (s0) ^ (s1) ^ xt2(s2) ^ xt3(s3) + val b3 = xt3(s0) ^ (s1) ^ (s2) ^ xt2(s3) + + b3 ## b2 ## b1 ## b0 + } + + private def aes_mixcolumns_fwd(x: UInt): UInt = { + val ic0 = aes_get_column(x, 0.U) + val ic1 = aes_get_column(x, 1.U) + val ic2 = aes_get_column(x, 2.U) + val ic3 = aes_get_column(x, 3.U) + + val oc0 = aes_mixcolumn_fwd(ic0) + val oc1 = aes_mixcolumn_fwd(ic1) + val oc2 = aes_mixcolumn_fwd(ic2) + val oc3 = aes_mixcolumn_fwd(ic3) + + oc3 ## oc2 ## oc1 ## oc0 + } + + private def aes_decode_rcon(r: UInt): UInt = { + Mux1H( + UIntToOH(r), + Seq( + "h00000001".U, + "h00000002".U, + "h00000004".U, + "h00000008".U, + "h00000010".U, + "h00000020".U, + "h00000040".U, + "h00000080".U, + "h0000001b".U, + "h00000036".U, + "h00000000".U, + "h00000000".U, + "h00000000".U, + "h00000000".U, + "h00000000".U, + "h00000000".U + ) + ) + } + + private def sig0(x: UInt): UInt = { + // NOTE: only support SEW=32 + x(31, 0).rotateRight(7).asUInt(31, 0) ^ + x(31, 0).rotateRight(18).asUInt(31, 0) ^ + (x(31, 0) >> 3).asUInt(31, 0) + } + + private def sig1(x: UInt): UInt = { + // NOTE: only support SEW=32 + x(31, 0).rotateRight(17).asUInt(31, 0) ^ + x(31, 0).rotateRight(19).asUInt(31, 0) ^ + (x(31, 0) >> 10).asUInt(31, 0) + } + + private def sum0(x: UInt): UInt = { + // 
NOTE: only support SEW=32 + x(31, 0).rotateRight(2).asUInt(31, 0) ^ + x(31, 0).rotateRight(13).asUInt(31, 0) ^ + x(31, 0).rotateRight(22).asUInt(31, 0) + } + + private def sum1(x: UInt): UInt = { + // NOTE: only support SEW=32 + x(31, 0).rotateRight(6).asUInt(31, 0) ^ + x(31, 0).rotateRight(11).asUInt(31, 0) ^ + x(31, 0).rotateRight(25).asUInt(31, 0) + } + + private def ch(x: UInt, y: UInt, z: UInt): UInt = ((x & y) ^ ((~x) & z)) + + private def maj(x: UInt, y: UInt, z: UInt): UInt = ((x & y) ^ (x & z) ^ (y & z)) + + private def ROUND_KEY(X: UInt, S: UInt) = { + X(31, 0) ^ ( + S ^ + S(31, 0).rotateLeft(13).asUInt(31, 0) ^ + S(31, 0).rotateLeft(23).asUInt(31, 0) + ) + } + + val sm4_sbox_table: Vec[UInt] = { + val tmp = Seq( + 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, 0x2b, 0x67, 0x9a, + 0x76, 0x2a, 0xbe, 0x04, 0xc3, 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, + 0x98, 0x7a, 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62, 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, 0x80, + 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6, 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, 0x83, 0x59, 0x3c, 0x19, + 0xe6, 0x85, 0x4f, 0xa8, 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, + 0x35, 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87, 0xd4, 0x00, + 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e, 0xea, 0xbf, 0x8a, 0xd2, 0x40, + 0xc7, 0x38, 0xb5, 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1, 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, + 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3, 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, 0xc0, 0x29, 0x23, + 0xab, 0x0d, 0x53, 0x4e, 0x6f, 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, + 0x5b, 0x51, 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 
0xd8, 0x0a,
+      0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0, 0x89, 0x69, 0x97, 0x4a,
+      0x0c, 0x96, 0x77, 0x7e, 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84, 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d,
+      0x20, 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
+    )
+    VecInit(tmp.map { s => s.asUInt(8.W) })
+  }
+
+  private def sm_sbox_lookup(x: UInt): UInt = { // SM4 S-box of one byte
+    sm4_sbox_table(x) // entry i holds SBox(i): index directly (the former x - 1 lookup aliased inputs 0 and 1)
+  }
+
+  private def sm4_subword(x: UInt): UInt = {
+    sm_sbox_lookup(x(31, 24)) ##
+      sm_sbox_lookup(x(23, 16)) ##
+      sm_sbox_lookup(x(15, 8)) ##
+      sm_sbox_lookup(x(7, 0))
+  }
+
+  val ck: Vec[UInt] = {
+    val tmp = Seq(
+      BigInt("00070e15", 16),
+      BigInt("1c232a31", 16),
+      BigInt("383f464d", 16),
+      BigInt("545b6269", 16),
+      BigInt("70777e85", 16),
+      BigInt("8c939aa1", 16),
+      BigInt("a8afb6bd", 16),
+      BigInt("c4cbd2d9", 16),
+      BigInt("e0e7eef5", 16),
+      BigInt("fc030a11", 16),
+      BigInt("181f262d", 16),
+      BigInt("343b4249", 16),
+      BigInt("50575e65", 16),
+      BigInt("6c737a81", 16),
+      BigInt("888f969d", 16),
+      BigInt("a4abb2b9", 16),
+      BigInt("c0c7ced5", 16),
+      BigInt("dce3eaf1", 16),
+      BigInt("f8ff060d", 16),
+      BigInt("141b2229", 16),
+      BigInt("30373e45", 16),
+      BigInt("4c535a61", 16),
+      BigInt("686f767d", 16),
+      BigInt("848b9299", 16),
+      BigInt("a0a7aeb5", 16),
+      BigInt("bcc3cad1", 16),
+      BigInt("d8dfe6ed", 16),
+      BigInt("f4fb0209", 16),
+      BigInt("10171e25", 16),
+      BigInt("2c333a41", 16),
+      BigInt("484f565d", 16),
+      BigInt("646b7279", 16)
+    )
+    VecInit(tmp.map { s => s.asUInt(32.W) })
+  }
+
+  private def sm4_round(X: UInt, S: UInt): UInt = {
+    X(31, 0) ^ (
+      S(31, 0) ^
+        S(31, 0).rotateLeft(2).asUInt(31, 0) ^
+        S(31, 0).rotateLeft(10).asUInt(31, 0) ^
+        S(31, 0).rotateLeft(18).asUInt(31, 0) ^
+        S(31, 0).rotateLeft(24).asUInt(31, 0)
+    )
+  }
+
+  // for vghsh.vv, vgmul.vv
+  val isVGHSH = (request.opcode === 0.U)
+  val outVG = {
+    val valZ = 0.U(parameter.datapathWidth.W)
+    val valS = UInt2BRev8(Mux(isVGHSH, vd ^ vs1, vd))
+    val valH = Mux(isVGHSH, vs2, UInt2BRev8(vs2))
+
+    // TODO: 128 rounds
+    // valZ := Mux(valS(0), valZ ^ valH, valZ)
+    // valH := Mux(valH(127), (valH << 1) & "h87".U, valH << 1.U)
+
+    UInt2BRev8(valZ)
+  }
+
+  val outAESDF = {
+    val state = vd
+    val rkey = vs2 // TODO: for vs, rkey is fixed to the first lane, support it before this module
+    val sr = aes_shift_rows_inv(state)
+    val sb = aes_subbytes_inv(sr)
+    val ark = sb ^ rkey
+    ark
+  }
+
+  val outAESDM = {
+    val state = vd
+    val rkey = vs2 // TODO: for vs, rkey is fixed to the first lane
+    val sr = aes_shift_rows_inv(state)
+    val sb = aes_subbytes_inv(sr)
+    val ark = sb ^ rkey
+    val mix = aes_mixcolumns_inv(ark)
+    mix
+  }
+
+  val outAESEF = {
+    val state = vd
+    val rkey = vs2 // TODO: for vs, rkey is fixed to the first lane
+    val sb = aes_subbytes_fwd(state)
+    val sr = aes_shift_rows_fwd(sb)
+    val ark = sr ^ rkey
+    ark
+  }
+
+  val outAESEM = {
+    val state = vd
+    val rkey = vs2 // TODO: for vs, rkey is fixed to the first lane
+    val sb = aes_subbytes_fwd(state)
+    val sr = aes_shift_rows_fwd(sb)
+    val mix = aes_mixcolumns_fwd(sr)
+    val ark = mix ^ rkey
+    ark
+  }
+
+  val outAESZ = {
+    val state = vd
+    val rkey = vs2 // TODO: rkey is fixed to the first lane
+    val ark = state ^ rkey
+    ark
+  }
+
+  val outAESKF1 = {
+    val currentRoundKey = vs2
+    val rnd = Mux((vs1(3, 0) > 10.U) | (vs1(3, 0) === 0.U), (~vs1(3)) ## vs1(2, 0), vs1(3, 0)) // rounds 1..10 pass through; only 0 and 11..15 get bit 3 flipped
+    val r = rnd - 1.U
+
+    val w0 = aes_subword_fwd(aes_rotword(currentRoundKey(127, 96))) ^
+      aes_decode_rcon(r) ^ currentRoundKey(31, 0)
+    val w1 = w0 ^ currentRoundKey(63, 32)
+    val w2 = w1 ^ currentRoundKey(95, 64)
+    val w3 = w2 ^ currentRoundKey(127, 96)
+
+    w3 ## w2 ## w1 ## w0
+  }
+
+  val outAESKF2 = {
+    val currentRoundKey = vs2
+    val roundKey = vd
+    val rnd = Mux((vs1(3, 0) < 2.U) | (vs1(3, 0) > 14.U), (~vs1(3)) ## vs1(2, 0), vs1(3, 0))
+
+    val w0 = Mux(
+      rnd(0) === 1.U,
+      aes_subword_fwd(currentRoundKey(127, 96)) ^ roundKey(31, 0),
+      aes_subword_fwd(aes_rotword(currentRoundKey(127, 96))) ^
+        aes_decode_rcon((rnd >> 1.U) - 1.U) ^
+        roundKey(31, 0)
+    )
+    val w1 = w0 ^ roundKey(63, 32)
+    val w2 = w1 ^ roundKey(95, 64)
+    val w3 = w2 ^ roundKey(127, 96)
+
+    w3 ## w2 ## w1 ## w0
+  }
+
+  val outSHA2MS = {
+    val mw = vd
+    val mx = vs2
+    val my = vs1
+
+    val mz0 = sig1(my(95, 64)) + mx(63, 32) + sig0(mw(63, 32)) + mw(31, 0)
+    val mz1 = sig1(my(127, 96)) + mx(95, 64) + sig0(mw(95, 64)) + mw(63, 32)
+    val mz2 = sig1(mz0) + mx(127, 96) + sig0(mw(127, 96)) + mw(95, 64)
+    val mz3 = sig1(mz1) + my(31, 0) + sig0(mx(31, 0)) + mw(127, 96)
+
+    mz3 ## mz2 ## mz1 ## mz0
+  }
+
+  val outSHA2CHL = {
+    val isVSHA2CL = (request.opcode === 13.U)
+    val ma = vs2(127, 96)
+    val mb = vs2(95, 64)
+    val me = vs2(63, 32)
+    val mf = vs2(31, 0)
+
+    val mc = vd(127, 96)
+    val md = vd(95, 64)
+    val mg = vd(63, 32)
+    val mh = vd(31, 0)
+
+    val messageShedPlusCa = vs1(127, 96)
+    val messageShedPlusCb = vs1(95, 64)
+    val messageShedPlusCc = vs1(63, 32)
+    val messageShedPlusCd = vs1(31, 0)
+
+    val w1 = Mux(isVSHA2CL, messageShedPlusCc, messageShedPlusCa)
+    val w0 = Mux(isVSHA2CL, messageShedPlusCd, messageShedPlusCb)
+
+    val t1 = mh + sum1(me) + ch(me, mf, mg) + w0
+    val t2 = sum0(ma) + maj(ma, mb, mc)
+
+    val mmh = mg
+    val mmg = mf
+    val mmf = me
+    val mme = md + t1
+    val mmd = mc
+    val mmc = mb
+    val mmb = ma
+    val mma = t1 + t2
+    val mt1 = mmh + sum1(mme) + ch(mme, mmf, mmg) + w1
+    val mt2 = sum0(mma) + maj(mma, mmb, mmc)
+
+    val mmmh = mmg
+    val mmmg = mmf
+    val mmmf = mme
+    val mmme = mmd + mt1
+    val mmmd = mmc
+    val mmmc = mmb
+    val mmmb = mma
+    val mmma = mt1 + mt2
+
+    mmma ## mmmb ## mmme ## mmmf
+  }
+
+  val outSM4K = {
+    val rnd = vs1(2, 0)
+
+    val rk3 = vs2(127, 96)
+    val rk2 = vs2(95, 64)
+    val rk1 = vs2(63, 32)
+    val rk0 = vs2(31, 0)
+
+    val B = rk1 ^ rk2 ^ rk3 ^ ck(rnd << 2.U)(31, 0)
+    val S = sm4_subword(B)
+    val rk4 = ROUND_KEY(rk0, S)
+
+    val mB = rk2 ^ rk3 ^ rk4 ^ ck((rnd << 2.U) + 1.U)(31, 0)
+    val mS = sm4_subword(mB)
+    val rk5 = ROUND_KEY(rk1, mS) // second key uses this step's S-box output (mS), not the first step's S
+
+    val mmB =
rk3 ^ rk4 ^ rk5 ^ ck((rnd << 2.U) + 2.U)(31, 0) + val mmS = sm4_subword(mmB) + + val rk6 = ROUND_KEY(rk2, mmS) + val mmmB = rk4 ^ rk5 ^ rk6 ^ ck((rnd << 2.U) + 3.U)(31, 0) + val mmmS = sm4_subword(mmmB) + + val rk7 = ROUND_KEY(rk3, mmmS) + + rk7 ## rk6 ## rk5 ## rk4 + } + + val outSM4R = { + val rk3 = vs2(127, 96) // TODO: for vs, rkey is fixed to the first lane + val rk2 = vs2(95, 64) // TODO: for vs, rkey is fixed to the first lane + val rk1 = vs2(63, 32) // TODO: for vs, rkey is fixed to the first lane + val rk0 = vs2(31, 0) // TODO: for vs, rkey is fixed to the first lane + + val x3 = vd(127, 96) + val x2 = vd(95, 64) + val x1 = vd(63, 32) + val x0 = vd(31, 0) + + val mb = x1 ^ x2 ^ x3 ^ rk0 + val ms = sm4_subword(mb) + val x4 = sm4_round(x0, ms) + + val mmb = x2 ^ x3 ^ x4 ^ rk1 + val mms = sm4_subword(mmb) + val x5 = sm4_round(x1, mms) + + val mmmb = x3 ^ x4 ^ x5 ^ rk2 + val mmms = sm4_subword(mmmb) + val x6 = sm4_round(x2, mmms) + + val mmmmb = x4 ^ x5 ^ x6 ^ rk3 + val mmmms = sm4_subword(mmmmb) + val x7 = sm4_round(x3, mmmms) + + x7 ## x6 ## x5 ## x4 + } + + response.data := Mux1H( + UIntToOH(request.opcode), + Seq( + outVG, + outVG, + outAESDF, + outAESDM, + outAESEF, + outAESEM, + outAESZ, + outAESKF1, + outAESKF2, + outSHA2MS, + outSHA2CHL, + outSM4K, + outSM4R, + outSHA2CHL + ) + ) +} diff --git a/t1/src/LaneZvk256.scala b/t1/src/LaneZvk256.scala new file mode 100644 index 000000000..f10016794 --- /dev/null +++ b/t1/src/LaneZvk256.scala @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3.experimental.hierarchy.instantiable +import chisel3._ +import chisel3.experimental.{SerializableModule, SerializableModuleParameter} +import chisel3.util._ +import org.chipsalliance.t1.rtl.decoder.{BoolField, Decoder} + +object LaneZvk256Param { + implicit def rw: upickle.default.ReadWriter[LaneZvk256Param] = upickle.default.macroRW +} + +case class 
LaneZvk256Param(datapathWidth: Int, latency: Int) extends VFUParameter with SerializableModuleParameter { + val inputBundle = new LaneZvk256Request(datapathWidth) // TODO: make `datapathWidth` as 256 bits + val decodeField: BoolField = Decoder.zvk256 + val outputBundle = new LaneZvk256Response(datapathWidth) + override val NeedSplit: Boolean = false +} + +class LaneZvk256Request(datapathWidth: Int) extends VFUPipeBundle { + val src = Vec(3, UInt(datapathWidth.W)) + val opcode = UInt(4.W) + val vSew = UInt(2.W) + // val shifterSize = UInt(log2Ceil(datapathWidth).W) +} + +class LaneZvk256Response(datapathWidth: Int) extends VFUPipeBundle { + val data = UInt(datapathWidth.W) +} + +@instantiable +class LaneZvk256(val parameter: LaneZvk256Param) extends VFUModule(parameter) with SerializableModule[LaneZvk256Param] { + val response: LaneZvk256Response = Wire(new LaneZvk256Response(parameter.datapathWidth)) + val request: LaneZvk256Request = connectIO(response).asTypeOf(parameter.inputBundle) + + val vs1: UInt = request.src(0) + val vs2: UInt = request.src(1) + val vd: UInt = request.src(2) + val vSew: UInt = UIntToOH(request.vSew) // sew = 0, 1, 2 + + private def rev8(x: UInt): UInt = { + VecInit(x.asBools.grouped(8).map(s => VecInit(s)).toSeq.reverse).asUInt // element's byte reverse + } + + private def FF1(X: UInt, Y: UInt, Z: UInt): UInt = ((X) ^ (Y) ^ (Z)) + + private def FF2(X: UInt, Y: UInt, Z: UInt): UInt = (((X) & (Y)) | ((X) & (Z)) | ((Y) & (Z))) + + private def FF_j(X: UInt, Y: UInt, Z: UInt, J: UInt): UInt = { + Mux(((J) <= 15.U), FF1(X, Y, Z), FF2(X, Y, Z)) + } + + private def GG1(X: UInt, Y: UInt, Z: UInt): UInt = ((X) ^ (Y) ^ (Z)) + + private def GG2(X: UInt, Y: UInt, Z: UInt): UInt = (((X) & (Y)) | ((~(X)) & (Z))) + + private def GG_j(X: UInt, Y: UInt, Z: UInt, J: UInt): UInt = { + Mux(((J) <= 15.U), GG1(X, Y, Z), GG2(X, Y, Z)) + } + + private def T_j(J: UInt): UInt = { + Mux(((J) <= 15.U), ("h79CC4519".U), ("h7A879D8A".U)) + } + + private def P_0(X: 
UInt): UInt = {
+    (X) ^
+      X.rotateLeft(9) ^
+      X.rotateLeft(17)
+  }
+
+  private def P_1(X: UInt): UInt = {
+    ((X) ^ X.rotateLeft(15)) ^ X.rotateLeft(23)
+  }
+  private def ZVKSH_W(M16: UInt, M9: UInt, M3: UInt, M13: UInt, M6: UInt): UInt = {
+    P_1((M16) ^ (M9) ^ (M3.rotateLeft(15))) ^
+      M13.rotateLeft(7) ^
+      M6
+  }
+
+  val outSM3C = {
+    val hi = vd(255, 224)
+    val gi = vd(223, 192)
+    val fi = vd(191, 160)
+    val ei = vd(159, 128)
+    val di = vd(127, 96)
+    val ci = vd(95, 64)
+    val bi = vd(63, 32)
+    val ai = vd(31, 0)
+
+    val u_w7 = vs2(255, 224)
+    val u_w6 = vs2(223, 192)
+    val w5i = vs2(191, 160)
+    val w4i = vs2(159, 128)
+    val u_w3 = vs2(127, 96)
+    val u_w2 = vs2(95, 64)
+    val w1i = vs2(63, 32)
+    val w0i = vs2(31, 0)
+
+    val mH = rev8(hi)
+    val mG = rev8(gi)
+    val mF = rev8(fi)
+    val mE = rev8(ei)
+    val mD = rev8(di)
+    val mC = rev8(ci)
+    val mB = rev8(bi)
+    val mA = rev8(ai)
+
+    val w5 = rev8(w5i)
+    val w4 = rev8(w4i)
+    val w1 = rev8(w1i)
+    val w0 = rev8(w0i)
+
+    val x0 = w0 ^ w4
+    val x1 = w1 ^ w5
+
+    val rnds = vs1(4, 0)
+    val j = rnds << 1
+    val ss1 = (mA(31, 0).rotateLeft(12) + mE + T_j(j).rotateLeft(j(4, 0))).rotateLeft(7)
+    val ss2 = ss1 ^ mA(31, 0).rotateLeft(12)
+    val tt1 = FF_j(mA, mB, mC, j) + mD + ss2 + x0
+    val tt2 = GG_j(mE, mF, mG, j) + mH + ss1 + w0
+
+    val mmD = mC
+    val mC1 = mB.rotateLeft(9)
+    val mmB = mA
+
+    val mA1 = tt1
+    val mmH = mG
+    val mG1 = mF.rotateLeft(19)
+    val mmF = mE
+    val mE1 = P_0(tt2)
+
+    val j1 = (rnds << 1) + 1.U
+    val mss1 = (mA1.rotateLeft(12) + mE1 + T_j(j1).rotateLeft(j1(4, 0))).rotateLeft(7) // rotate by this round's index j1, not the previous round's j
+    val mss2 = mss1 ^ mA1.rotateLeft(12)
+
+    val mtt1 = FF_j(mA1, mmB, mC1, j1) + mmD + mss2 + x1
+    val mtt2 = GG_j(mE1, mmF, mG1, j1) + mmH + mss1 + w1
+    val mmmD = mC1
+    val mC2 = mmB.rotateLeft(9)
+    val mmmB = mA1
+    val mA2 = mtt1
+    val mmmH = mG1
+    val mG2 = mmF.rotateLeft(19)
+    val mmmF = mE1
+    val mE2 = P_0(mtt2)
+
+    rev8(mG1) ## rev8(mG2) ## rev8(mE1) ## rev8(mE2) ## rev8(mC1) ## rev8(mC2) ## rev8(mA1) ## rev8(mA2) // undo the byte swap applied on load so vd keeps the input byte order
+  }
+
+  val outSM3ME = {
+    val w7 = rev8(vs1(255, 224))
+    val w6 = rev8(vs1(223, 192))
+    val w5 = rev8(vs1(191, 160))
+    val w4 = rev8(vs1(159, 128))
+    val w3 = rev8(vs1(127, 96))
+    val w2 = rev8(vs1(95, 64))
+    val w1 = rev8(vs1(63, 32))
+    val w0 = rev8(vs1(31, 0))
+
+    val w15 = rev8(vs2(255, 224))
+    val w14 = rev8(vs2(223, 192))
+    val w13 = rev8(vs2(191, 160))
+    val w12 = rev8(vs2(159, 128))
+    val w11 = rev8(vs2(127, 96))
+    val w10 = rev8(vs2(95, 64))
+    val w9 = rev8(vs2(63, 32))
+    val w8 = rev8(vs2(31, 0))
+
+    val mw16 = ZVKSH_W(w0, w7, w13, w3, w10) // new words stay in the same byte order as w0..w15 because they feed later expansions
+    val mw17 = ZVKSH_W(w1, w8, w14, w4, w11)
+    val mw18 = ZVKSH_W(w2, w9, w15, w5, w12)
+    val mw19 = ZVKSH_W(w3, w10, mw16, w6, w13)
+    val mw20 = ZVKSH_W(w4, w11, mw17, w7, w14)
+    val mw21 = ZVKSH_W(w5, w12, mw18, w8, w15)
+    val mw22 = ZVKSH_W(w6, w13, mw19, w9, mw16)
+    val mw23 = ZVKSH_W(w7, w14, mw20, w10, mw17)
+
+    rev8(mw23) ## rev8(mw22) ## rev8(mw21) ## rev8(mw20) ## rev8(mw19) ## rev8(mw18) ## rev8(mw17) ## rev8(mw16) // byte swap once, at the output only
+  }
+
+  response.data := Mux1H(
+    UIntToOH(request.opcode(0)),
+    Seq(
+      outSM3C,
+      outSM3ME
+    )
+  )
+}
diff --git a/t1/src/T1.scala b/t1/src/T1.scala
index a7ab45e5b..27a21300d 100644
--- a/t1/src/T1.scala
+++ b/t1/src/T1.scala
@@ -127,9 +127,17 @@ case class T1Parameter(
       .instructions(org.chipsalliance.rvdecoderdb.extractResource(getClass.getClassLoader))
       .filter { instruction =>
         instruction.instructionSet.name match {
-          case "rv_v" => true
-          case "rv_zvbb" => if (zvbbEnable) true else false
-          case _ => false
+          case "rv_v" => true
+          case "rv_zvbb" => if (zvbbEnable) true else false
+          // Zvk
+          case "rv_zvkg" => if (zvkEnable) true else false
+          // case "rv_zvkn" => if (zvkEnable) true else false // TODO: no implementations for SEW=64
+          case "rv_zvkned" => if (zvkEnable) true else false
+          case "rv_zvknha" => if (zvkEnable) true else false
+          // case "rv_zvknhb" => if (zvkEnable) true else false // TODO: no implementations for SEW=64
+          case "rv_zvksed" => if (zvkEnable) true else false
+          case "rv_zvksh" => if (zvkEnable) true else false
+          case _ => false
         }
       } ++
t1customInstructions.map(_.instruction) @@ -140,7 +148,7 @@ case class T1Parameter( } } - require(extensions.forall(Seq("Zve32x", "Zve32f", "Zvbb").contains), "unsupported extension.") + require(extensions.forall(Seq("Zve32x", "Zve32f", "Zvbb", "Zvk").contains), "unsupported extension.") // TODO: require bank not overlap /** xLen of T1, we currently only support 32. */ val xLen: Int = 32 @@ -151,15 +159,19 @@ case class T1Parameter( /** TODO: configure it. */ val instructionQueueSize: Int = 4 - /** crosslane write token size */ - val vrfWriteQueueSize: Int = 4 + /** crosslane write token size, unclear how many would be good */ + val vrfWriteQueueSize: Int = 4 + val vrfWriteZvkQueueSize: Int = 8 /** does t1 has floating datapath? */ val fpuEnable: Boolean = extensions.contains("Zve32f") - /** support of zvbb */ + /** support of Zvbb */ lazy val zvbbEnable: Boolean = extensions.contains("Zvbb") + /** support of Zvk */ + lazy val zvkEnable: Boolean = extensions.contains("Zvk") + /** how many chaining does T1 support, this is not a parameter yet. */ val chainingSize: Int = 4 @@ -227,9 +239,11 @@ case class T1Parameter( // each element: Each lane will be connected to the other two lanes, // and the values are their respective delays. - val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) + val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) + val zvkCrossLaneConnectCycles: Option[Seq[Seq[Int]]] = + Option.when(zvkEnable)(Seq.tabulate(laneNumber)(_ => Seq(1, 1, 1, 1))) - val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, zvkEnable, allInstructions) /** paraemter for AXI4. 
*/ val axi4BundleParameter: AXI4BundleParameter = AXI4BundleParameter( @@ -265,7 +279,9 @@ case class T1Parameter( laneNumber = laneNumber, chainingSize = chainingSize, crossLaneVRFWriteEscapeQueueSize = vrfWriteQueueSize, + crossLaneVRFWriteEscapeZvkQueueSize = vrfWriteZvkQueueSize, fpuEnable = fpuEnable, + zvkEnable = zvkEnable, portFactor = vrfBankSize, vrfRamType = vrfRamType, decoderParam = decoderParam, @@ -292,7 +308,7 @@ case class T1Parameter( axi4BundleParameter = axi4BundleParameter, name = "main" ) - def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, vrfBankSize, vrfRamType) + def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, vrfBankSize, zvkEnable, vrfRamType) require(xLen == datapathWidth) def adderParam: LaneAdderParam = LaneAdderParam(datapathWidth, 0) } @@ -1660,6 +1676,39 @@ class T1(val parameter: T1Parameter) } } + if (parameter.zvkEnable) { + parameter.zvkCrossLaneConnectCycles.get.zipWithIndex.foreach { case (cycles, index) => + cycles.zipWithIndex.foreach { case (cycle, portIndex) => + // read source <=> write sink + val readSourceIndex = (4 * index + portIndex) % parameter.laneNumber + val readSourcePort = (4 * index + portIndex) / parameter.laneNumber + // println("testing", readSourcePort) + + // read connect + laneVec(readSourceIndex).zvkReadBusPort.get(readSourcePort).deqRelease := Pipe( + laneVec(index).zvkReadBusPort.get(portIndex).enqRelease, + 0.U.asTypeOf(new EmptyBundle), + cycle + ).valid + connectWithShifter(cycle)( + laneVec(readSourceIndex).zvkReadBusPort.get(readSourcePort).deq, + laneVec(index).zvkReadBusPort.get(portIndex).enq + ) + + // write connect + laneVec(index).zvkWriteBusPort.get(portIndex).deqRelease := Pipe( + laneVec(readSourceIndex).zvkWriteBusPort.get(readSourcePort).enqRelease, + 0.U.asTypeOf(new EmptyBundle), + cycle + ).valid + connectWithShifter(cycle)( + laneVec(index).zvkWriteBusPort.get(portIndex).deq, + 
laneVec(readSourceIndex).zvkWriteBusPort.get(readSourcePort).enq + ) + } + } + } + io.highBandwidthLoadStorePort <> lsu.axi4Port io.indexedLoadStorePort <> lsu.simpleAccessPorts // 暂时直接连lsu的写,后续需要处理scheduler的写 diff --git a/t1/src/VectorFunctionUnit.scala b/t1/src/VectorFunctionUnit.scala index e8fdf7a27..329b619f8 100644 --- a/t1/src/VectorFunctionUnit.scala +++ b/t1/src/VectorFunctionUnit.scala @@ -106,7 +106,8 @@ case class VFUInstantiateParameter( divfpModuleParameters: Seq[(SerializableModuleGenerator[LaneDivFP, LaneDivFPParam], Seq[Int])], otherModuleParameters: Seq[(SerializableModuleGenerator[OtherUnit, OtherUnitParam], Seq[Int])], floatModuleParameters: Seq[(SerializableModuleGenerator[LaneFloat, LaneFloatParam], Seq[Int])], - zvbbModuleParameters: Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])]) { + zvbbModuleParameters: Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])], + zvkModuleParameters: Seq[(SerializableModuleGenerator[LaneZvk, LaneZvkParam], Seq[Int])]) { val genVec: Seq[(SerializableModuleGenerator[_ <: VFUModule, _ <: VFUParameter], Seq[Int])] = logicModuleParameters ++ aluModuleParameters ++ @@ -116,7 +117,8 @@ case class VFUInstantiateParameter( divfpModuleParameters ++ otherModuleParameters ++ floatModuleParameters ++ - zvbbModuleParameters + zvbbModuleParameters ++ + zvkModuleParameters genVec.foreach { case (_, connect) => connect.foreach(connectIndex => require(connectIndex < slotCount)) } diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala index 4072ae589..d68101073 100644 --- a/t1/src/decoder/Decoder.scala +++ b/t1/src/decoder/Decoder.scala @@ -13,7 +13,7 @@ import org.chipsalliance.t1.rtl.decoder.attribute._ object DecoderParam { implicit def rwP: upickle.default.ReadWriter[DecoderParam] = upickle.default.macroRW } -case class DecoderParam(fpuEnable: Boolean, zvbbEnable: Boolean, allInstructions: Seq[Instruction]) +case class DecoderParam(fpuEnable: Boolean, zvbbEnable: 
Boolean, zvkEnable: Boolean, allInstructions: Seq[Instruction]) trait T1DecodeFiled[D <: Data] extends DecodeField[T1DecodePattern, D] with FieldName @@ -225,6 +225,18 @@ object Decoder { override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvbb.value } + object zvk extends BoolField { + override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvk.value + } + + object zvk128 extends BoolField { + override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvk128.value + } + + object zvk256 extends BoolField { + override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvk256.value + } + object topUop extends T1TopUopField { override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match { case _: TopT0.type => BitPat("b000") @@ -346,6 +358,24 @@ object Decoder { case _: zvbbUop9.type => BitPat("b1001") // pop case _ => BitPat.dontCare(4) } + case zvkCase: ZvkUOPType => + zvkCase match { + case _: zvkUop0.type => BitPat("b0000") // vghsh / vsm3c + case _: zvkUop1.type => BitPat("b0001") // vgmul / vsm3me + case _: zvkUop2.type => BitPat("b0010") // vaesdf + case _: zvkUop3.type => BitPat("b0011") // vaesdm + case _: zvkUop4.type => BitPat("b0100") // vaesef + case _: zvkUop5.type => BitPat("b0101") // vaesem + case _: zvkUop6.type => BitPat("b0110") // vaesz + case _: zvkUop7.type => BitPat("b0111") // vaeskf1 + case _: zvkUop8.type => BitPat("b1000") // vaeskf2 + case _: zvkUop9.type => BitPat("b1001") // vsha2ms + case _: zvkUop10.type => BitPat("b1010") // vsha2ch + case _: zvkUop11.type => BitPat("b1011") // vsm4k + case _: zvkUop12.type => BitPat("b1100") // vsm4r + case _: zvkUop13.type => BitPat("b1101") // vsha2cl + case _ => BitPat.dontCare(4) + } case _ => BitPat.dontCare(4) } } @@ -423,6 +453,14 @@ object Decoder { zvbb ) else Seq() + } ++ { + if (param.zvkEnable) + Seq( + zvk, + zvk128, + zvk256 + ) + else Seq() } def allDecodePattern(param: DecoderParam): 
Seq[T1DecodePattern] = param.allInstructions.map(T1DecodePattern(_, param)).toSeq.sortBy(_.instruction.name) diff --git a/t1/src/decoder/InstructionDocumentation.scala b/t1/src/decoder/InstructionDocumentation.scala index 4cd498403..ffe6c0e11 100644 --- a/t1/src/decoder/InstructionDocumentation.scala +++ b/t1/src/decoder/InstructionDocumentation.scala @@ -440,5 +440,31 @@ case class InstructionDocumentation(instruction: Instruction, param: DecoderPara case "vwsll.vv" => "TODO!" case "vwsll.vx" => "TODO!" case "vwsll.vi" => "TODO!" + // rv_zvkg + case "vghsh.vv" => "TODO!" + case "vgmul.vv" => "TODO!" + // rv_zvkned + case "vaesdf.vv" => "TODO!" + case "vaesdf.vs" => "TODO!" + case "vaesdm.vv" => "TODO!" + case "vaesdm.vs" => "TODO!" + case "vaesef.vv" => "TODO!" + case "vaesef.vs" => "TODO!" + case "vaesem.vv" => "TODO!" + case "vaesem.vs" => "TODO!" + case "vaesz.vs" => "TODO!" + case "vaeskf1.vi" => "TODO!" + case "vaeskf2.vi" => "TODO!" + // rv_zvknha + case "vsha2ms.vv" => "TODO!" + case "vsha2ch.vv" => "TODO!" + case "vsha2cl.vv" => "TODO!" + // rv_zvksed + case "vsm4k.vi" => "TODO!" + case "vsm4r.vv" => "TODO!" + case "vsm4r.vs" => "TODO!" + // rv_zvksh + case "vsm3c.vi" => "TODO!" + case "vsm3me.vv" => "TODO!" 
} } diff --git a/t1/src/decoder/T1DecodePattern.scala b/t1/src/decoder/T1DecodePattern.scala index 2193b1f73..f0460c539 100644 --- a/t1/src/decoder/T1DecodePattern.scala +++ b/t1/src/decoder/T1DecodePattern.scala @@ -107,6 +107,9 @@ case class T1DecodePattern(instruction: Instruction, param: DecoderParam) extend def isVwmacc: isVwmacc = attribute.isVwmacc(this) def isWidenreduce: isWidenreduce = attribute.isWidenreduce(this) def isZvbb: isZvbb = attribute.isZvbb(this) + def isZvk: isZvk = attribute.isZvk(this) + def isZvk128: isZvk128 = attribute.isZvk128(this) + def isZvk256: isZvk256 = attribute.isZvk256(this) def fpExecutionType: FpExecutionType.Type = attribute.FpExecutionType(this) def topUop: TopUop = attribute.TopUop(this) def decoderUop: DecoderUop = attribute.DecoderUop(this) diff --git a/t1/src/decoder/attribute/isItype.scala b/t1/src/decoder/attribute/isItype.scala index 7449c0b9b..dd99ce950 100644 --- a/t1/src/decoder/attribute/isItype.scala +++ b/t1/src/decoder/attribute/isItype.scala @@ -53,7 +53,14 @@ object isItype { "vxor.vi", // rv_zvbb "vror.vi", - "vwsll.vi" + "vwsll.vi", + // rv_zvkned + "vaeskf1.vi", + "vaeskf2.vi", + // rv_zvksed + "vsm4k.vi", + // rv_zvksh + "vsm3c.vi" ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isVtype.scala b/t1/src/decoder/attribute/isVtype.scala index 52bc00e50..49a11c5ba 100644 --- a/t1/src/decoder/attribute/isVtype.scala +++ b/t1/src/decoder/attribute/isVtype.scala @@ -185,7 +185,23 @@ object isVtype { "vandn.vv", "vrol.vv", "vror.vv", - "vwsll.vv" + "vwsll.vv", + // rv_zvkg + "vghsh.vv", + "vgmul.vv", + // rv_zvkned + "vaesdf.vv", + "vaesdm.vv", + "vaesef.vv", + "vaesem.vv", + // rv_zvknha + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + // rv_zvksed + "vsm4r.vv", + // rv_zvksh + "vsm3me.vv" ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isZvk.scala b/t1/src/decoder/attribute/isZvk.scala new file mode 100644 index 
000000000..f99bdca32
--- /dev/null
+++ b/t1/src/decoder/attribute/isZvk.scala
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl.decoder.attribute
+
+import org.chipsalliance.t1.rtl.decoder.T1DecodePattern
+
+/** Decoder attribute that flags the vector-crypto instructions listed in [[isZvk.y]]. */
+object isZvk {
+  /** Classifies `t1DecodePattern` with the first of [[y]], [[n]] and [[dc]] that accepts it. */
+  def apply(t1DecodePattern: T1DecodePattern): isZvk =
+    Seq(
+      y _ -> Y,
+      n _ -> N,
+      dc _ -> DC
+    ).collectFirst {
+      case (fn, tri) if fn(t1DecodePattern) => isZvk(tri)
+    }.get
+
+  /** True when `zvkEnable` is set and the instruction name is in the list below. */
+  def y(t1DecodePattern: T1DecodePattern): Boolean = {
+    val allMatched =
+      if (t1DecodePattern.param.zvkEnable)
+        Seq(
+          "vghsh.vv",
+          "vgmul.vv",
+          "vaesdf.vv",
+          "vaesdf.vs",
+          "vaesdm.vv",
+          "vaesdm.vs",
+          "vaesef.vv",
+          "vaesef.vs",
+          "vaesem.vv",
+          "vaesem.vs",
+          "vaesz.vs",
+          "vaeskf1.vi",
+          "vaeskf2.vi",
+          "vsha2ms.vv",
+          "vsha2ch.vv",
+          "vsha2cl.vv",
+          "vsm4k.vi",
+          "vsm4r.vv",
+          "vsm4r.vs",
+          "vsm3c.vi",
+          "vsm3me.vv"
+        )
+      else Seq()
+    allMatched.contains(t1DecodePattern.instruction.name)
+  }
+
+  /** True for a known instruction that neither [[y]] nor [[dc]] accepts. */
+  def n(t1DecodePattern: T1DecodePattern): Boolean = {
+    // y and dc do not depend on the candidate instruction, so evaluate them once instead of once per entry.
+    val unclaimed = !(y(t1DecodePattern) || dc(t1DecodePattern))
+    unclaimed && t1DecodePattern.param.allInstructions.contains(t1DecodePattern.instruction)
+  }
+
+  /** No instruction is a don't-care for this attribute. */
+  def dc(t1DecodePattern: T1DecodePattern): Boolean = false
+}
+
+case class isZvk(value: TriState) extends BooleanDecodeAttribute {
+  override val description: String = "goes to [[org.chipsalliance.t1.rtl.LaneZvk]]."
+}
diff --git a/t1/src/decoder/attribute/isZvk128.scala b/t1/src/decoder/attribute/isZvk128.scala
new file mode 100644
index 000000000..986feb566
--- /dev/null
+++ b/t1/src/decoder/attribute/isZvk128.scala
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl.decoder.attribute
+
+import org.chipsalliance.t1.rtl.decoder.T1DecodePattern
+
+/** Decoder attribute that flags the vector-crypto instructions listed in [[isZvk128.y]]. */
+object isZvk128 {
+  /** Classifies `t1DecodePattern` with the first of [[y]], [[n]] and [[dc]] that accepts it. */
+  def apply(t1DecodePattern: T1DecodePattern): isZvk128 =
+    Seq(
+      y _ -> Y,
+      n _ -> N,
+      dc _ -> DC
+    ).collectFirst {
+      case (fn, tri) if fn(t1DecodePattern) => isZvk128(tri)
+    }.get
+
+  /** True when `zvkEnable` is set and the instruction name is in the list below. */
+  def y(t1DecodePattern: T1DecodePattern): Boolean = {
+    val allMatched =
+      if (t1DecodePattern.param.zvkEnable)
+        Seq(
+          "vghsh.vv",
+          "vgmul.vv",
+          "vaesdf.vv",
+          "vaesdf.vs",
+          "vaesdm.vv",
+          "vaesdm.vs",
+          "vaesef.vv",
+          "vaesef.vs",
+          "vaesem.vv",
+          "vaesem.vs",
+          "vaesz.vs",
+          "vaeskf1.vi",
+          "vaeskf2.vi",
+          "vsha2ms.vv",
+          "vsha2ch.vv",
+          "vsha2cl.vv",
+          "vsm4k.vi",
+          "vsm4r.vv",
+          "vsm4r.vs"
+        )
+      else Seq()
+    allMatched.contains(t1DecodePattern.instruction.name)
+  }
+
+  /** True for a known instruction that neither [[y]] nor [[dc]] accepts. */
+  def n(t1DecodePattern: T1DecodePattern): Boolean = {
+    // y and dc do not depend on the candidate instruction, so evaluate them once instead of once per entry.
+    val unclaimed = !(y(t1DecodePattern) || dc(t1DecodePattern))
+    unclaimed && t1DecodePattern.param.allInstructions.contains(t1DecodePattern.instruction)
+  }
+
+  /** No instruction is a don't-care for this attribute. */
+  def dc(t1DecodePattern: T1DecodePattern): Boolean = false
+}
+
+case class isZvk128(value: TriState) extends BooleanDecodeAttribute {
+  override val description: String = "goes to [[org.chipsalliance.t1.rtl.LaneZvk]]."
+}
diff --git a/t1/src/decoder/attribute/isZvk256.scala b/t1/src/decoder/attribute/isZvk256.scala
new file mode 100644
index 000000000..cbf7ad255
--- /dev/null
+++ b/t1/src/decoder/attribute/isZvk256.scala
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl.decoder.attribute
+
+import org.chipsalliance.t1.rtl.decoder.T1DecodePattern
+
+/** Decoder attribute that flags the vector-crypto instructions listed in [[isZvk256.y]]. */
+object isZvk256 {
+  /** Classifies `t1DecodePattern` with the first of [[y]], [[n]] and [[dc]] that accepts it. */
+  def apply(t1DecodePattern: T1DecodePattern): isZvk256 =
+    Seq(
+      y _ -> Y,
+      n _ -> N,
+      dc _ -> DC
+    ).collectFirst {
+      case (fn, tri) if fn(t1DecodePattern) => isZvk256(tri)
+    }.get
+
+  /** True when `zvkEnable` is set and the instruction name is in the list below. */
+  def y(t1DecodePattern: T1DecodePattern): Boolean = {
+    val allMatched =
+      if (t1DecodePattern.param.zvkEnable)
+        Seq(
+          "vsm3c.vi",
+          "vsm3me.vv"
+        )
+      else Seq()
+    allMatched.contains(t1DecodePattern.instruction.name)
+  }
+
+  /** True for a known instruction that neither [[y]] nor [[dc]] accepts. */
+  def n(t1DecodePattern: T1DecodePattern): Boolean = {
+    // y and dc do not depend on the candidate instruction, so evaluate them once instead of once per entry.
+    val unclaimed = !(y(t1DecodePattern) || dc(t1DecodePattern))
+    unclaimed && t1DecodePattern.param.allInstructions.contains(t1DecodePattern.instruction)
+  }
+
+  /** No instruction is a don't-care for this attribute. */
+  def dc(t1DecodePattern: T1DecodePattern): Boolean = false
+}
+
+case class isZvk256(value: TriState) extends BooleanDecodeAttribute {
+  override val description: String = "goes to [[org.chipsalliance.t1.rtl.LaneZvk256]]."
+}
diff --git a/t1/src/decoder/attribute/zvkUop.scala b/t1/src/decoder/attribute/zvkUop.scala
new file mode 100644
index 000000000..b7b8d2c23
--- /dev/null
+++ b/t1/src/decoder/attribute/zvkUop.scala
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: 2022 Jiuyang Liu
+
+package org.chipsalliance.t1.rtl.decoder.attribute
+
+import org.chipsalliance.t1.rtl.decoder.T1DecodePattern
+
+/** Common type of the uop values assigned by [[ZvkUOP]]. */
+trait ZvkUOPType extends Uop
+object zvkUop0  extends ZvkUOPType
+object zvkUop1  extends ZvkUOPType
+object zvkUop2  extends ZvkUOPType
+object zvkUop3  extends ZvkUOPType
+object zvkUop4  extends ZvkUOPType
+object zvkUop5  extends ZvkUOPType
+object zvkUop6  extends ZvkUOPType
+object zvkUop7  extends ZvkUOPType
+object zvkUop8  extends ZvkUOPType
+object zvkUop9  extends ZvkUOPType
+object zvkUop10 extends ZvkUOPType
+object zvkUop11 extends ZvkUOPType
+object zvkUop12 extends ZvkUOPType
+object zvkUop13 extends ZvkUOPType
+
+/** Selects the zvk uop of an instruction from its name. */
+object ZvkUOP {
+  /** Returns the uop of the first predicate `t0` .. `t13` that accepts the pattern, or `UopDC` if none does. */
+  def apply(t1DecodePattern: T1DecodePattern): Uop = {
+    val candidates = Seq(
+      t0 _  -> zvkUop0,
+      t1 _  -> zvkUop1,
+      t2 _  -> zvkUop2,
+      t3 _  -> zvkUop3,
+      t4 _  -> zvkUop4,
+      t5 _  -> zvkUop5,
+      t6 _  -> zvkUop6,
+      t7 _  -> zvkUop7,
+      t8 _  -> zvkUop8,
+      t9 _  -> zvkUop9,
+      t10 _ -> zvkUop10,
+      t11 _ -> zvkUop11,
+      t12 _ -> zvkUop12,
+      t13 _ -> zvkUop13
+    )
+    candidates.find { case (accepts, _) => accepts(t1DecodePattern) }.map(_._2).getOrElse(UopDC)
+  }
+
+  /** True when the instruction name of `t1DecodePattern` is one of `names`. */
+  private def named(t1DecodePattern: T1DecodePattern, names: String*): Boolean =
+    names.contains(t1DecodePattern.instruction.name)
+
+  def t0(t1DecodePattern: T1DecodePattern): Boolean =
+    named(t1DecodePattern, "vghsh.vv", "vsm3c.vi") // vsm3c.vi: reuse for zvk256
+  def t1(t1DecodePattern: T1DecodePattern): Boolean =
+    named(t1DecodePattern, "vgmul.vv", "vsm3me.vv") // vsm3me.vv: reuse for zvk256
+  def t2(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vaesdf.vv", "vaesdf.vs")
+  def t3(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vaesdm.vv", "vaesdm.vs")
+  def t4(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vaesef.vv", "vaesef.vs")
+  def t5(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vaesem.vv", "vaesem.vs")
+  def t6(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vaesz.vs")
+  def t7(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vaeskf1.vi")
+  def t8(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vaeskf2.vi")
+  def t9(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vsha2ms.vv")
+  def t10(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vsha2ch.vv")
+  def t11(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vsm4k.vi")
+  def t12(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vsm4r.vv", "vsm4r.vs")
+  def t13(t1DecodePattern: T1DecodePattern): Boolean = named(t1DecodePattern, "vsha2cl.vv")
+}
diff --git a/t1/src/laneStage/LaneExecutionBridge.scala
b/t1/src/laneStage/LaneExecutionBridge.scala index 56405aac9..a0947ae11 100644 --- a/t1/src/laneStage/LaneExecutionBridge.scala +++ b/t1/src/laneStage/LaneExecutionBridge.scala @@ -23,6 +23,8 @@ import org.chipsalliance.t1.rtl.decoder.Decoder class LaneExecuteRequest(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = + Option.when(isLastSlot && parameter.zvkEnable)(UInt((parameter.datapathWidth * 4).W)) val bordersForMaskLogic: Bool = Bool() val mask: UInt = UInt((parameter.datapathWidth / 8).W) val maskForFilter: UInt = UInt((parameter.datapathWidth / 8).W) @@ -39,11 +41,13 @@ class LaneExecuteRequest(parameter: LaneParameter, isLastSlot: Boolean) extends } class LaneExecuteResponse(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { - val data: UInt = UInt(parameter.datapathWidth.W) - val ffoIndex: UInt = UInt(log2Ceil(parameter.vLen / 8).W) - val crossWriteData: Option[Vec[UInt]] = Option.when(isLastSlot)(Vec(2, UInt(parameter.datapathWidth.W))) - val ffoSuccess: Option[Bool] = Option.when(isLastSlot)(Bool()) - val fpReduceValid: Option[Bool] = Option.when(parameter.fpuEnable && isLastSlot)(Bool()) + val data: UInt = UInt(parameter.datapathWidth.W) + val ffoIndex: UInt = UInt(log2Ceil(parameter.vLen / 8).W) + val crossWriteData: Option[Vec[UInt]] = Option.when(isLastSlot)(Vec(2, UInt(parameter.datapathWidth.W))) + val zvkCrossWriteData: Option[Vec[UInt]] = + Option.when(isLastSlot && parameter.zvkEnable)(Vec(4, UInt(parameter.datapathWidth.W))) + val ffoSuccess: Option[Bool] = Option.when(isLastSlot)(Bool()) + val fpReduceValid: Option[Bool] = Option.when(parameter.fpuEnable && isLastSlot)(Bool()) } class ExecutionBridgeRecordQueue(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { @@ -94,6 +98,12 @@ class 
LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd // execution result from execute unit val executionResult = RegInit(0.U(parameter.datapathWidth.W)) val crossWriteLSB: Option[UInt] = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + val zvkCrossWriteLSB0: Option[UInt] = + Option.when(isLastSlot && parameter.zvkEnable)(RegInit(0.U(parameter.datapathWidth.W))) + val zvkCrossWriteLSB1: Option[UInt] = + Option.when(isLastSlot && parameter.zvkEnable)(RegInit(0.U(parameter.datapathWidth.W))) + val zvkCrossWriteLSB2: Option[UInt] = + Option.when(isLastSlot && parameter.zvkEnable)(RegInit(0.U(parameter.datapathWidth.W))) val outStandingRequestSize: Int = 4.max(parameter.vfuInstantiateParameter.maxLatency + 3) val outStanding: UInt = RegInit(0.U(log2Ceil(outStandingRequestSize).W)) val outStandingUpdate: UInt = Mux(vfuRequest.fire, 1.U(outStanding.getWidth.W), (-1.S(outStanding.getWidth.W)).asUInt) @@ -166,6 +176,9 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd executionRecord.maskForFilter := enqueue.bits.maskForFilter executionRecord.source := enqueue.bits.src executionRecord.crossReadSource.foreach(_ := enqueue.bits.crossReadSource.get) + if (parameter.zvkEnable) { + executionRecord.zvkCrossReadSource.foreach(_ := enqueue.bits.zvkCrossReadSource.get) + } executionRecord.sSendResponse.foreach(_ := enqueue.bits.sSendResponse.get) executionRecord.groupCounter := enqueue.bits.groupCounter executionRecord.decodeResult := enqueue.bits.decodeResult @@ -178,10 +191,22 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd /** collapse the dual SEW size operand for cross read. it can be vd or src2. 
*/ - val doubleCollapse: Option[UInt] = Option.when(isLastSlot) { + val doubleCollapse: Option[UInt] = Option.when(isLastSlot) { val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.crossReadSource.get, parameter.datapathWidth) Mux(executionRecord.executeIndex, cutCrossReadData(1), cutCrossReadData(0)) } + val quadrupleCollapse: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable) { + val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.zvkCrossReadSource.get, parameter.datapathWidth) + Mux1H( + UIntToOH(executionRecord.zvkExecuteIndex.get), + Seq( + cutCrossReadData(0), + cutCrossReadData(1), + cutCrossReadData(2), + cutCrossReadData(3) + ) + ) + } // For cross read, extend 32 bit source1 to 64 bit, then select by executeIndex def dataExtend(data: UInt, sign: Bool): UInt = { @@ -221,7 +246,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd ) } else { normalSource1 - } + } // TODO: vs1 cross val reduceFoldSource2: Option[UInt] = Option.when(isLastSlot)(Wire(UInt(parameter.datapathWidth.W))) @@ -229,8 +254,13 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd */ val finalSource2: UInt = if (isLastSlot) { Mux( - executionRecord.crossReadVS2, - doubleCollapse.get, + executionRecord.crossReadVS2, { + if (parameter.zvkEnable) { + Mux(executionRecord.decodeResult(Decoder.zvk), quadrupleCollapse.get, doubleCollapse.get) + } else { + doubleCollapse.get + } + }, Mux( executionRecord.decodeResult(Decoder.crossWrite) || (executionRecord.decodeResult( Decoder.widenReduce @@ -389,6 +419,17 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd crossWriteLSB.foreach { crossWriteData => crossWriteData := dataDequeue } + if (parameter.zvkEnable) { + zvkCrossWriteLSB0.foreach { crossWriteData => + crossWriteData := dataDequeue + } + zvkCrossWriteLSB1.zip(zvkCrossWriteLSB0).foreach { case (zvkCrossWriteData1, zvkCrossWriteData0) => + zvkCrossWriteData1 := 
zvkCrossWriteData0 + } + zvkCrossWriteLSB2.zip(zvkCrossWriteLSB1).foreach { case (zvkCrossWriteData2, zvkCrossWriteData1) => + zvkCrossWriteData2 := zvkCrossWriteData1 + } + } } /** update value for [[maskFormatResultForGroup]] */ @@ -576,6 +617,10 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd } queue.io.enq.bits.ffoIndex := recordQueue.io.deq.bits.groupCounter ## dataResponse.bits.data(4, 0) queue.io.enq.bits.crossWriteData.foreach(_ := VecInit((crossWriteLSB ++ Seq(dataDequeue)).toSeq)) + if (parameter.zvkEnable) { + queue.io.enq.bits.zvkCrossWriteData + .foreach(_ := VecInit((zvkCrossWriteLSB0 ++ zvkCrossWriteLSB1 ++ zvkCrossWriteLSB2 ++ Seq(dataDequeue)).toSeq)) + } queue.io.enq.bits.ffoSuccess.foreach(_ := dataResponse.bits.ffoSuccess) queue.io.enq.bits.fpReduceValid.foreach(_ := !waitFirstValidFire.get) recordQueue.io.deq.ready := dataResponse.valid || (recordNotExecute && queue.io.enq.ready) diff --git a/t1/src/laneStage/LaneStage1.scala b/t1/src/laneStage/LaneStage1.scala index 9cde79dde..2093a842c 100644 --- a/t1/src/laneStage/LaneStage1.scala +++ b/t1/src/laneStage/LaneStage1.scala @@ -11,7 +11,7 @@ import chisel3.ltl._ import chisel3.ltl.Sequence._ import chisel3.util.experimental.decode.DecodeBundle import org.chipsalliance.t1.rtl.decoder.Decoder -import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, LaneState, VrfReadPipe} +import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, LaneState, VrfReadPipe, ZvkCrossReadUnit} class LaneStage1Enqueue(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { val groupCounter: UInt = UInt(parameter.groupNumberBits.W) @@ -49,6 +49,8 @@ class LaneStage1Dequeue(parameter: LaneParameter, isLastSlot: Boolean) extends B // read result val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = + Option.when(isLastSlot && 
parameter.zvkEnable)(UInt((parameter.datapathWidth * 4).W)) // pipe state // for exe stage @@ -87,8 +89,15 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { @public val vrfCheckRequest: Vec[VRFReadRequest] = IO(Vec(readCheckSize, Output(readRequestType))) + val zvkReadCheckSize: Int = if (isLastSlot && parameter.zvkEnable) 7 else 3 @public - val checkResult: Vec[Bool] = IO(Vec(readCheckSize, Input(Bool()))) + val zvkVrfCheckRequest: Option[Vec[VRFReadRequest]] = + Option.when(parameter.zvkEnable)(IO(Vec(zvkReadCheckSize, Output(readRequestType)))) + + @public + val checkResult: Vec[Bool] = IO(Vec(readCheckSize, Input(Bool()))) + @public + val zvkCheckResult: Option[Vec[Bool]] = Option.when(parameter.zvkEnable)(IO(Vec(zvkReadCheckSize, Input(Bool())))) /** VRF read result for each slot, 3 is for [[source1]] [[source2]] [[source3]] */ @@ -96,16 +105,26 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val vrfReadResult: Vec[UInt] = IO(Input(Vec(3, UInt(parameter.datapathWidth.W)))) @public - val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)( + val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)( IO( Vec(2, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) ) ) + @public + val zvkReadBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot & parameter.zvkEnable)( + IO( + Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) + ) + ) @public val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new ReadBusData(parameter))))) + @public + val zvkReadBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(isLastSlot & parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) + val groupCounter: UInt = enqueue.bits.groupCounter // todo: param @@ -132,12 +151,30 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: 
Boolean) extends Module { val queueAfterCheckMSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkMSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + // read request queue for cross read lsb & msb val queueBeforeCheckLSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) val queueBeforeCheckMSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkMSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, 
readRequestQueueSizeBeforeCheck))) + // pipe from enqueue val pipeQueue: Queue[LaneStage1Enqueue] = Module( @@ -156,11 +193,35 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val afterCheckQueueVec: Seq[Queue[VRFReadQueueEntry]] = Seq(queueAfterCheck1, queueAfterCheck2, queueAfterCheckVd) ++ queueAfterCheckLSB ++ queueAfterCheckMSB - val allReadQueueReady: Bool = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) + + val beforeCheckZvkQueueVec: Seq[Queue[VRFReadQueueEntry]] = + Seq(queueBeforeCheck1, queueBeforeCheck2, queueBeforeCheckVd) ++ + queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ + queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB + val afterCheckZvkQueueVec: Seq[Queue[VRFReadQueueEntry]] = + Seq(queueAfterCheck1, queueAfterCheck2, queueAfterCheckVd) ++ + queueAfterCheckZvkLSBLSB ++ queueAfterCheckZvkLSBMSB ++ + queueAfterCheckZvkMSBLSB ++ queueAfterCheckZvkMSBMSB + + val allReadQueueReady: Bool = { + val ready = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) + if (parameter.zvkEnable) { + val zvkReady = beforeCheckZvkQueueVec.map(_.io.enq.ready).reduce(_ && _) + Mux(enqueue.bits.decodeResult(Decoder.crossRead) & enqueue.bits.decodeResult(Decoder.zvk), zvkReady, ready) + } else { + ready + } + } beforeCheckQueueVec.foreach { q => q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex q.io.enq.bits.groupIndex := enqueue.bits.groupCounter } + if (parameter.zvkEnable) { + beforeCheckZvkQueueVec.foreach { q => + q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex + q.io.enq.bits.groupIndex := enqueue.bits.groupCounter + } + } enqueue.ready := allReadQueueReady && pipeQueue.io.enq.ready @@ -171,6 +232,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { after.io.enq.valid := before.io.deq.valid && checkResult(i) after.io.enq.bits := before.io.deq.bits } + if (parameter.zvkEnable) { + 
beforeCheckZvkQueueVec.zip(afterCheckZvkQueueVec).zipWithIndex.foreach { case ((before, after), i) => + zvkVrfCheckRequest.get(i) := before.io.deq.bits + before.io.deq.ready := after.io.enq.ready && zvkCheckResult.get(i) + after.io.enq.valid := before.io.deq.valid && zvkCheckResult.get(i) + after.io.enq.bits := before.io.deq.bits + } + } // request enqueue queueBeforeCheck1.io.enq.valid := enqueue.fire && enqueue.bits.decodeResult(Decoder.vtype) && !enqueue.bits.skipRead queueBeforeCheck2.io.enq.valid := enqueue.fire && !enqueue.bits.skipRead @@ -178,6 +247,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { (queueBeforeCheckLSB ++ queueBeforeCheckMSB).foreach { q => q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult(Decoder.crossRead) } + if (parameter.zvkEnable) { + (queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB).foreach { + q => + q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult( + Decoder.crossRead + ) && enqueue.bits.decodeResult(Decoder.zvk) + } + } // calculate vs queueBeforeCheck1.io.enq.bits.vs := Mux( @@ -232,6 +309,29 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B } + if (parameter.zvkEnable) { // TODO: check here + queueBeforeCheckZvkLSBLSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkLSBMSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + 
queueBeforeCheckZvkMSBLSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkMSBMSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + } + // read pipe val readPipe0: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = false)) val readPipe1: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = isLastSlot)) @@ -247,8 +347,18 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val dataQueueVd: Queue[UInt] = Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize)) // cross lane queue - val dataQueueLSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) - val dataQueueMSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueLSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueMSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueZvkLSBLSB = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueZvkLSBMSB = Option.when(isLastSlot && parameter.zvkEnable)( + Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize)) + ) // TODO + val dataQueueZvkMSBLSB = Option.when(isLastSlot && parameter.zvkEnable)( + Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize)) + ) // TODO + val dataQueueZvkMSBMSB = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new 
Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) val dataQueueNotFull2: Bool = { val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) @@ -304,17 +414,80 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { blockingHandshake(port, queue.io.deq, dataQueueNotFullMSB) } + if (parameter.zvkEnable) { + readPipe1.contender.zip(queueAfterCheckZvkLSBLSB).foreach { case (port, queue) => + val dataQueueNotFullLSBLSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkLSBLSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullLSBLSB) + } + readPipe1.contender.zip(queueAfterCheckZvkLSBMSB).foreach { case (port, queue) => + val dataQueueNotFullLSBMSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkLSBMSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullLSBMSB) + } + readPipe2.contender.zip(queueAfterCheckZvkMSBLSB).foreach { case (port, queue) => + val dataQueueNotFullMSBLSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkMSBLSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullMSBLSB) + } + readPipe2.contender.zip(queueAfterCheckZvkMSBMSB).foreach { case (port, 
queue) => + val dataQueueNotFullMSBMSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkMSBMSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullMSBMSB) + } + } + // data: pipe <-> queue if (isLastSlot) { // pipe1 <-> dataQueueVs2 dataQueueVs2.io.enq <> readPipe1.dequeue // pipe1 <> dataQueueLSB dataQueueLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + if (parameter.zvkEnable) { + dataQueueZvkLSBLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + dataQueueZvkLSBMSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + } // pipe2 <-> dataQueueVd dataQueueVd.io.enq <> readPipe2.dequeue // pipe2 <-> dataQueueMSB dataQueueMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + if (parameter.zvkEnable) { + dataQueueZvkMSBLSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + dataQueueZvkMSBMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + } } else { dataQueueVs2.io.enq <> readPipe1.dequeue dataQueueVd.io.enq <> readPipe2.dequeue @@ -326,6 +499,13 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val crossReadStageFree: Option[Bool] = Option.when(isLastSlot)(Wire(Bool())) val crossReadUnitOp: Option[Instance[CrossReadUnit]] = Option.when(isLastSlot)(Instantiate(new CrossReadUnit(parameter))) + + val zvkCrossReadResultQueue: Option[Queue[UInt]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt((parameter.datapathWidth * 4).W), 1))) + val zvkCrossReadStageFree: Option[Bool] = 
Option.when(isLastSlot && parameter.zvkEnable)(Wire(Bool()))
+  val zvkCrossReadUnitOp: Option[Instance[ZvkCrossReadUnit]] =
+    Option.when(isLastSlot && parameter.zvkEnable)(Instantiate(new ZvkCrossReadUnit(parameter)))
+
   if (isLastSlot) {
     val dataGroupQueue: Queue[UInt] =
       Module(
@@ -351,7 +531,37 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
     AssertProperty(BoolSequence(dataGroupQueue.io.enq.ready || !dataGroupQueue.io.enq.valid))
     dataGroupQueue.io.enq.bits := enqueue.bits.groupCounter
     dataGroupQueue.io.deq.ready := crossReadUnit.dataInputLSB.fire
-    dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup
+    dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup // TODO: readBusDequeueGroup is currently unused
+
+    if (parameter.zvkEnable) {
+      val zvkDataGroupQueue: Queue[UInt] =
+        Module(
+          new Queue(
+            UInt(parameter.groupNumberBits.W),
+            readRequestQueueSizeBeforeCheck + readRequestQueueSizeBeforeCheck + dataQueueSize + 2
+          )
+        )
+      val zvkCrossReadUnit = zvkCrossReadUnitOp.get
+      zvkCrossReadUnit.dataInputLSBLSB <> dataQueueZvkLSBLSB.get.io.deq
+      zvkCrossReadUnit.dataInputLSBMSB <> dataQueueZvkLSBMSB.get.io.deq
+      zvkCrossReadUnit.dataInputMSBLSB <> dataQueueZvkMSBLSB.get.io.deq
+      zvkCrossReadUnit.dataInputMSBMSB <> dataQueueZvkMSBMSB.get.io.deq
+      zvkCrossReadUnit.laneIndex := laneIndexReg
+      zvkCrossReadUnit.dataGroup := zvkDataGroupQueue.io.deq.bits
+      zvkReadBusRequest.get.zip(zvkCrossReadUnit.readBusRequest.get).foreach { case (sink, source) => sink <> source }
+      zvkCrossReadUnit.readBusDequeue.get.zip(zvkReadBusDequeue.get).foreach { case (sink, source) => sink <> source }
+      zvkCrossReadResultQueue.get.io.enq <> zvkCrossReadUnit.crossReadDequeue
+      zvkCrossReadStageFree.get := zvkCrossReadUnit.crossReadStageFree
+
+      // data group
+      // Tag zvk cross reads only: the zvk read queues are enqueued under this same condition, and an entry
+      // is released when zvkCrossReadUnit.dataInputLSBLSB fires.
+      zvkDataGroupQueue.io.enq.valid := enqueue.fire &&
+        enqueue.bits.decodeResult(Decoder.crossRead) && enqueue.bits.decodeResult(Decoder.zvk)
+      assert(zvkDataGroupQueue.io.enq.ready ||
!zvkDataGroupQueue.io.enq.valid)
+      zvkDataGroupQueue.io.enq.bits := enqueue.bits.groupCounter
+      zvkDataGroupQueue.io.deq.ready := zvkCrossReadUnit.dataInputLSBLSB.fire
+    }
   }
 
   val source1Select: UInt = Mux(
@@ -363,6 +570,9 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
   dequeue.bits.groupCounter := pipeQueue.io.deq.bits.groupCounter
   dequeue.bits.src := VecInit(Seq(source1Select, dataQueueVs2.io.deq.bits, dataQueueVd.io.deq.bits))
   dequeue.bits.crossReadSource.foreach(_ := crossReadResultQueue.get.io.deq.bits)
+  if (parameter.zvkEnable) {
+    dequeue.bits.zvkCrossReadSource.foreach(_ := zvkCrossReadResultQueue.get.io.deq.bits)
+  }
   dequeue.bits.sSendResponse.foreach(_ := pipeQueue.io.deq.bits.sSendResponse.get)
   dequeue.bits.decodeResult := pipeQueue.io.deq.bits.decodeResult
   dequeue.bits.vSew1H := pipeQueue.io.deq.bits.vSew1H
@@ -384,7 +594,17 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
       dataQueueVs2.io.deq.valid || pipeQueue.io.deq.bits.skipRead,
       dataQueueVd.io.deq.valid || (pipeQueue.io.deq.bits.decodeResult(Decoder.sReadVD))
     ) ++
-      crossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead))
+      crossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) ++ {
+        if (parameter.zvkEnable)
+          // The zvk result queue is only dequeued for `crossRead && zvk` instructions (see its deq.ready
+          // below), so every other instruction must not wait for it to become valid.
+          zvkCrossReadResultQueue.map(
+            _.io.deq.valid ||
+              !(pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead) &&
+                pipeQueue.io.deq.bits.decodeResult(Decoder.zvk))
+          )
+        else Seq()
+      }
   val allDataQueueValid: Bool = VecInit(dataQueueValidVec).asUInt.andR
   dequeue.valid := allDataQueueValid && pipeQueue.io.deq.valid
   dataQueueVs1.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.vtype)
@@ -394,28 +612,59 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module {
   crossReadResultQueue.foreach(
     _.io.deq.ready := allDataQueueValid && dequeue.ready &&
pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead) ) + if (parameter.zvkEnable) { + zvkCrossReadResultQueue.foreach( + _.io.deq.ready := allDataQueueValid && dequeue.ready && (pipeQueue.io.deq.bits + .decodeResult(Decoder.crossRead) && pipeQueue.io.deq.bits.decodeResult(Decoder.zvk)) + ) + } stageValid := pipeQueue.io.deq.valid val stageFinish = !stageValid // TODO: gather these logic into a Probe Bundle @public - val dequeueReadyProbe = IO(Output(Probe(Bool(), layers.Verification))) + val dequeueReadyProbe = IO(Output(Probe(Bool(), layers.Verification))) + @public + val dequeueValidProbe = IO(Output(Probe(Bool(), layers.Verification))) @public - val dequeueValidProbe = IO(Output(Probe(Bool(), layers.Verification))) + val hasDataOccupiedProbe = IO(Output(Probe(Bool(), layers.Verification))) @public - val hasDataOccupiedProbe = IO(Output(Probe(Bool(), layers.Verification))) + val stageFinishProbe = IO(Output(Probe(Bool(), layers.Verification))) @public - val stageFinishProbe = IO(Output(Probe(Bool(), layers.Verification))) + val readFinishProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) @public - val readFinishProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) + val sSendCrossReadResultLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) @public - val sSendCrossReadResultLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) + val sSendCrossReadResultMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) @public - val sSendCrossReadResultMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) + val sSendZvkCrossReadResultLSBLSBProbe = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) @public - val wCrossReadLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) + val sSendZvkCrossReadResultLSBMSBProbe = + 
Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) @public - val wCrossReadMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) + val sSendZvkCrossReadResultMSBLSBProbe = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) + @public + val sSendZvkCrossReadResultMSBMSBProbe = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) + + @public + val wCrossReadLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) + @public + val wCrossReadMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool(), layers.Verification)))) + @public + val wZvkCrossReadLSBLSBProbe = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) + @public + val wZvkCrossReadLSBMSBProbe = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) + @public + val wZvkCrossReadMSBLSBProbe = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) + @public + val wZvkCrossReadMSBMSBProbe = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool(), layers.Verification)))) @public val vrfReadRequestProbe: Seq[(Bool, Bool)] = Seq.fill(3)((IO(Output(Probe(Bool(), layers.Verification))), IO(Output(Probe(Bool(), layers.Verification))))) @@ -436,6 +685,33 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { ) wCrossReadLSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadLSB))) wCrossReadMSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadMSB))) + + if (parameter.zvkEnable) { + sSendZvkCrossReadResultLSBLSBProbe.foreach(p => + define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(0))) + ) + sSendZvkCrossReadResultLSBMSBProbe.foreach(p => + define(p, 
ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(1))) + ) + sSendZvkCrossReadResultMSBLSBProbe.foreach(p => + define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(2))) + ) + sSendZvkCrossReadResultMSBMSBProbe.foreach(p => + define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(3))) + ) + wZvkCrossReadLSBLSBProbe.foreach(p => + define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(0))) + ) + wZvkCrossReadLSBMSBProbe.foreach(p => + define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(1))) + ) + wZvkCrossReadMSBLSBProbe.foreach(p => + define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(2))) + ) + wZvkCrossReadMSBMSBProbe.foreach(p => + define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(3))) + ) + } } vrfReadRequestProbe.zipWithIndex.foreach { case ((ready, valid), i) => diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala index f84c54806..64822994b 100644 --- a/t1/src/laneStage/LaneStage3.scala +++ b/t1/src/laneStage/LaneStage3.scala @@ -11,21 +11,22 @@ import org.chipsalliance.t1.rtl.decoder.Decoder import org.chipsalliance.t1.rtl._ class LaneStage3Enqueue(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { - val groupCounter: UInt = UInt(parameter.groupNumberBits.W) - val data: UInt = UInt(parameter.datapathWidth.W) - val pipeData: UInt = UInt(parameter.datapathWidth.W) - val mask: UInt = UInt((parameter.datapathWidth / 8).W) - val ffoIndex: UInt = UInt(log2Ceil(parameter.vLen / 8).W) - val crossWriteData: Vec[UInt] = Vec(2, UInt(parameter.datapathWidth.W)) - val sSendResponse: Bool = Bool() - val ffoSuccess: Bool = Bool() - val fpReduceValid: Option[Bool] = Option.when(parameter.fpuEnable && isLastSlot)(Bool()) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) + val data: UInt = UInt(parameter.datapathWidth.W) + val pipeData: UInt = UInt(parameter.datapathWidth.W) 
+ val mask: UInt = UInt((parameter.datapathWidth / 8).W) + val ffoIndex: UInt = UInt(log2Ceil(parameter.vLen / 8).W) + val crossWriteData: Vec[UInt] = Vec(2, UInt(parameter.datapathWidth.W)) + val zvkCrossWriteData: Option[Vec[UInt]] = Option.when(parameter.zvkEnable)(Vec(4, UInt(parameter.datapathWidth.W))) + val sSendResponse: Bool = Bool() + val ffoSuccess: Bool = Bool() + val fpReduceValid: Option[Bool] = Option.when(parameter.fpuEnable && isLastSlot)(Bool()) // pipe state - val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) - val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) + val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) // Need real-time status, no pipe - val ffoByOtherLanes: Bool = Bool() - val loadStore: Bool = Bool() + val ffoByOtherLanes: Bool = Bool() + val loadStore: Bool = Bool() /** vd or rd */ val vd: UInt = UInt(5.W) @@ -59,6 +60,9 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { @public val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter))))) + @public + val zvkCrossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = + Option.when(isLastSlot && parameter.zvkEnable)(IO(Vec(4, Decoupled(new WriteBusData(parameter))))) val stageValidReg: Option[Bool] = Option.when(isLastSlot)(RegInit(false.B)) @@ -68,6 +72,11 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** schedule cross lane write MSB */ val sCrossWriteMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) + val sZvkCrossWriteLSBLSB: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(true.B)) + val sZvkCrossWriteLSBMSB: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(true.B)) + val sZvkCrossWriteMSBLSB: Option[Bool] = Option.when(isLastSlot &&
parameter.zvkEnable)(RegInit(true.B)) + val sZvkCrossWriteMSBMSB: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(true.B)) + // state for response to scheduler /** schedule send [[LaneResponse]] to scheduler */ val sSendResponse: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) @@ -79,6 +88,10 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { when(enqueue.fire) { pipeEnqueue.foreach(_ := enqueue.bits) (sCrossWriteLSB ++ sCrossWriteMSB).foreach(_ := !enqueue.bits.decodeResult(Decoder.crossWrite)) + if (parameter.zvkEnable) { + (sZvkCrossWriteLSBLSB ++ sZvkCrossWriteLSBMSB ++ sZvkCrossWriteMSBLSB ++ sZvkCrossWriteMSBMSB) + .foreach(_ := !(enqueue.bits.decodeResult(Decoder.crossWrite) & enqueue.bits.decodeResult(Decoder.zvk))) + } (sSendResponse ++ wResponseFeedback).foreach( _ := enqueue.bits.decodeResult(Decoder.scheduler) || enqueue.bits.sSendResponse ) @@ -111,6 +124,20 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { sendState(index) := true.B } } + if (parameter.zvkEnable) { + val zvkSendState = + (sZvkCrossWriteLSBLSB ++ sZvkCrossWriteLSBMSB ++ sZvkCrossWriteMSBLSB ++ sZvkCrossWriteMSBMSB).toSeq + zvkCrossWritePort.get.zipWithIndex.foreach { case (port, index) => + port.valid := stageValidReg.get && !zvkSendState(index) + port.bits.mask := 0.U((parameter.datapathWidth / 2 / 8).W) // Note: leave it for empty + port.bits.data := pipeEnqueue.get.zvkCrossWriteData.get(index) + port.bits.counter := pipeEnqueue.get.groupCounter + port.bits.instructionIndex := pipeEnqueue.get.instructionIndex + when(port.fire) { + zvkSendState(index) := true.B + } + } + } // scheduler synchronization val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _) @@ -163,7 +190,9 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { // Handshake /** Cross-lane writing is over */ - val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ 
sCrossWriteMSB).reduce(_ && _) + val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ sCrossWriteMSB ++ + sZvkCrossWriteLSBLSB ++ sZvkCrossWriteLSBMSB ++ + sZvkCrossWriteMSBLSB ++ sZvkCrossWriteMSBMSB).reduce(_ && _) enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && schedulerFinish && vrfWriteReady) val dequeueFire = stageValidReg.get && CrossLaneWriteOver && schedulerFinish && vrfWriteReady diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index b2ed09a4e..a11cf55ec 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -72,6 +72,10 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { @public val crossWriteReports: Vec[ValidIO[UInt]] = IO(Vec(2, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) + @public + val zvkCrossWriteReports: Option[Vec[ValidIO[UInt]]] = + Option.when(parameter.zvkEnable)(IO(Vec(4, Flipped(Valid(UInt(parameter.instructionIndexBits.W)))))) + @public val responseReport: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @@ -143,10 +147,18 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val pendingSlotWrite = tokenUpdate(writeToken, writeDoEnq, writeDoDeq) if (slotIndex == 0) { - val responseToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) - val feedbackToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) - val crossWriteTokenLSB: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) - val crossWriteTokenMSB: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val responseToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val feedbackToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val crossWriteTokenLSB: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => 
RegInit(0.U(tokenWith.W))) + val crossWriteTokenMSB: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val zvkCrossWriteTokenLSBLSB: Option[Seq[UInt]] = + Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) + val zvkCrossWriteTokenLSBMSB: Option[Seq[UInt]] = + Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) + val zvkCrossWriteTokenMSBLSB: Option[Seq[UInt]] = + Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) + val zvkCrossWriteTokenMSBMSB: Option[Seq[UInt]] = + Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) // Feedback is not accurate (index load/store may have already finished the instruction) val responseIndexQueue = @@ -165,9 +177,51 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val crossWriteDeqMSB = maskAnd(crossWriteReports.last.valid, indexToOH(crossWriteReports.last.bits, parameter.chainingSize)).asUInt + val zvkCrossWriteDeqLSBLSB = + Option.when(parameter.zvkEnable)( + maskAnd( + zvkCrossWriteReports.get.head.valid, + indexToOH(zvkCrossWriteReports.get.head.bits, parameter.chainingSize) + ).asUInt + ) + val zvkCrossWriteDeqLSBMSB = + Option.when(parameter.zvkEnable)( + maskAnd( + zvkCrossWriteReports.get(1).valid, + indexToOH(zvkCrossWriteReports.get(1).bits, parameter.chainingSize) + ).asUInt + ) + val zvkCrossWriteDeqMSBLSB = + Option.when(parameter.zvkEnable)( + maskAnd( + zvkCrossWriteReports.get(2).valid, + indexToOH(zvkCrossWriteReports.get(2).bits, parameter.chainingSize) + ).asUInt + ) + val zvkCrossWriteDeqMSBMSB = + Option.when(parameter.zvkEnable)( + maskAnd( + zvkCrossWriteReports.get.last.valid, + indexToOH(zvkCrossWriteReports.get.last.bits, parameter.chainingSize) + ).asUInt + ) + val pendingCrossWriteLSB = tokenUpdate(crossWriteTokenLSB, crossWriteDoEnq,
crossWriteDeqLSB) val pendingCrossWriteMSB = tokenUpdate(crossWriteTokenMSB, crossWriteDoEnq, crossWriteDeqMSB) + val zvkPendingCrossWriteLSBLSB = Option.when(parameter.zvkEnable)( + tokenUpdate(zvkCrossWriteTokenLSBLSB.get, crossWriteDoEnq, zvkCrossWriteDeqLSBLSB.get) + ) + val zvkPendingCrossWriteLSBMSB = Option.when(parameter.zvkEnable)( + tokenUpdate(zvkCrossWriteTokenLSBMSB.get, crossWriteDoEnq, zvkCrossWriteDeqLSBMSB.get) + ) + val zvkPendingCrossWriteMSBLSB = Option.when(parameter.zvkEnable)( + tokenUpdate(zvkCrossWriteTokenMSBLSB.get, crossWriteDoEnq, zvkCrossWriteDeqMSBLSB.get) + ) + val zvkPendingCrossWriteMSBMSB = Option.when(parameter.zvkEnable)( + tokenUpdate(zvkCrossWriteTokenMSBMSB.get, crossWriteDoEnq, zvkCrossWriteDeqMSBMSB.get) + ) + // response & feedback update val responseDoEnq: UInt = maskAnd(enqReport.valid && !enqReport.bits.sSendResponse, enqOH).asUInt @@ -186,7 +240,11 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val pendingResponse = tokenUpdate(responseToken, responseDoEnq, responseDoDeq) // todo: Precise feedback val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq) - pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback + if (parameter.zvkEnable) { + pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | zvkPendingCrossWriteLSBLSB.get | zvkPendingCrossWriteLSBMSB.get | zvkPendingCrossWriteMSBLSB.get | zvkPendingCrossWriteMSBMSB.get | pendingResponse | pendingFeedback + } else { + pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback + } } else { pendingSlotWrite } diff --git a/t1/src/laneStage/ZvkCrossReadUnit.scala b/t1/src/laneStage/ZvkCrossReadUnit.scala new file mode 100644 index 000000000..199e7ac2a --- /dev/null +++ b/t1/src/laneStage/ZvkCrossReadUnit.scala @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package 
org.chipsalliance.t1.rtl.lane + +import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, public} +import chisel3.util._ +import org.chipsalliance.t1.rtl.{LaneParameter, ReadBusData} + +class ZvkCrossReadState extends Bundle { + val sSendCrossReadResult: Vec[Bool] = Vec(4, Bool()) + val wCrossRead: Vec[Bool] = Vec(4, Bool()) +} + +@instantiable +class ZvkCrossReadUnit(parameter: LaneParameter) extends Module { + @public + val dataInputLSBLSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputLSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputMSBLSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputMSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val laneIndex: UInt = IO(Input(UInt(parameter.laneNumberBits.W))) + @public + val dataGroup: UInt = IO(Input(UInt(parameter.groupNumberBits.W))) + @public + val currentGroup: UInt = IO(Output(UInt(parameter.groupNumberBits.W))) + + @public + val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(parameter.zvkEnable)( + IO(Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter))))) + ) + @public + val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) + + @public + val crossReadDequeue: DecoupledIO[UInt] = IO(Decoupled(UInt((parameter.datapathWidth * 4).W))) + @public + val crossReadStageFree: Bool = IO(Output(Bool())) + @public + val crossWriteState = IO(Output(new ZvkCrossReadState)) + + val stageValid: Bool = RegInit(false.B) + val sSendCrossReadResultLSBLSB, sSendCrossReadResultMSBLSB, wCrossReadLSBLSB, wCrossReadMSBLSB = RegInit(true.B) + val sSendCrossReadResultLSBMSB, sSendCrossReadResultMSBMSB, wCrossReadLSBMSB, wCrossReadMSBMSB = RegInit(true.B) + val stateVec: Seq[Bool] = Seq( + 
sSendCrossReadResultLSBLSB, + sSendCrossReadResultLSBMSB, + sSendCrossReadResultMSBLSB, + sSendCrossReadResultMSBMSB, + wCrossReadLSBLSB, + wCrossReadLSBMSB, + wCrossReadMSBLSB, + wCrossReadMSBMSB + ) + val sendDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) + val groupCounter: UInt = RegInit(0.U(parameter.groupNumberBits.W)) + val receiveDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) + val sendState = Seq( + sSendCrossReadResultLSBLSB, + sSendCrossReadResultLSBMSB, + sSendCrossReadResultMSBLSB, + sSendCrossReadResultMSBMSB + ) + val receiveState = Seq( + wCrossReadLSBLSB, + wCrossReadLSBMSB, + wCrossReadMSBLSB, + wCrossReadMSBMSB + ) + + readBusRequest.get.zipWithIndex.foreach { case (port, index) => + port.valid := stageValid && !sendState(index) + port.bits.data := sendDataVec(index) + when(port.fire) { sendState(index) := true.B } + } + + readBusDequeue.get.zipWithIndex.foreach { case (port, index) => + when(port.fire) { + receiveState(index) := true.B + receiveDataVec(index) := port.bits.data + } + port.ready := !receiveState(index) + } + val allStateReady: Bool = stateVec.reduce(_ && _) + val stageReady: Bool = !stageValid || (allStateReady && crossReadDequeue.ready) + val allSourceValid: Bool = Seq( + dataInputLSBLSB.valid, + dataInputLSBMSB.valid, + dataInputMSBLSB.valid, + dataInputMSBMSB.valid + ).reduce(_ && _) + val enqueueFire: Bool = stageReady && allSourceValid + dataInputLSBLSB.ready := allSourceValid && stageReady + dataInputLSBMSB.ready := allSourceValid && stageReady + dataInputMSBLSB.ready := allSourceValid && stageReady + dataInputMSBMSB.ready := allSourceValid && stageReady + + when(enqueueFire ^ crossReadDequeue.fire) { + stageValid := enqueueFire + } + when(enqueueFire) { + stateVec.foreach(_ := false.B) + sendDataVec := VecInit( + Seq( + dataInputLSBLSB.bits, + dataInputLSBMSB.bits, + dataInputMSBLSB.bits, + dataInputMSBMSB.bits + ) + ) + groupCounter := dataGroup + } + 
currentGroup := groupCounter + crossReadDequeue.bits := receiveDataVec.asUInt + crossReadDequeue.valid := allStateReady && stageValid + crossReadStageFree := (!stageValid) && stateVec.reduce(_ && _) + + crossWriteState.sSendCrossReadResult(0) := sSendCrossReadResultLSBLSB + crossWriteState.sSendCrossReadResult(1) := sSendCrossReadResultLSBMSB + crossWriteState.sSendCrossReadResult(2) := sSendCrossReadResultMSBLSB + crossWriteState.sSendCrossReadResult(3) := sSendCrossReadResultMSBMSB + crossWriteState.wCrossRead(0) := wCrossReadLSBLSB + crossWriteState.wCrossRead(1) := wCrossReadLSBMSB + crossWriteState.wCrossRead(2) := wCrossReadMSBLSB + crossWriteState.wCrossRead(3) := wCrossReadMSBMSB +} diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index 3a4c0441b..9e1332606 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -65,6 +65,7 @@ case class VRFParam( datapathWidth: Int, chainingSize: Int, portFactor: Int, + zvkEnable: Boolean, ramType: RamType) extends SerializableModuleParameter { @@ -155,7 +156,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar // then, we can know the accuracy read&write check hardware signal. 
// 3 * slot + 2 cross read @public - val readCheck: Vec[VRFReadRequest] = IO( + val readCheck: Vec[VRFReadRequest] = IO( Vec( parameter.chainingSize * 3 + 2, Input( @@ -163,9 +164,23 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar ) ) ) + @public + val zvkReadCheck: Option[Vec[VRFReadRequest]] = Option.when(parameter.zvkEnable)( + IO( + Vec( + parameter.chainingSize * 3 + 4, + Input( + new VRFReadRequest(parameter.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + ) + ) + ) + ) @public - val readCheckResult: Vec[Bool] = IO(Vec(parameter.chainingSize * 3 + 2, Output(Bool()))) + val readCheckResult: Vec[Bool] = IO(Vec(parameter.chainingSize * 3 + 2, Output(Bool()))) + @public + val zvkReadCheckResult: Option[Vec[Bool]] = + Option.when(parameter.zvkEnable)(IO(Vec(parameter.chainingSize * 3 + 4, Output(Bool())))) /** VRF read results. */ @public @@ -302,6 +317,27 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar .reduce(_ && _) } + if (parameter.zvkEnable) { + zvkReadCheck.get.zip(zvkReadCheckResult.get).foreach { case (req, res) => + val recordSelect = chainingRecord + val readRecord = + Mux1H(recordSelect.map(_.bits.instIndex === req.instructionIndex), recordSelect.map(_.bits)) + res := + recordSelect + .zip(recordValidVec) + .zipWithIndex + .map { case ((r, f), recordIndex) => + val checkModule = Instantiate(new ChainingCheck(parameter)) + checkModule.read := req + checkModule.readRecord := readRecord + checkModule.record := r + checkModule.recordValid := f + checkModule.checkResult + } + .reduce(_ && _) + } + } + val checkSize: Int = readRequests.size val (firstOccupied, secondOccupied) = readRequests.zipWithIndex.foldLeft( (0.U(parameter.rfBankNum.W), 0.U(parameter.rfBankNum.W)) diff --git a/t1rocket/src/T1RocketTile.scala b/t1rocket/src/T1RocketTile.scala index 8901f28d4..d91a98856 100644 --- a/t1rocket/src/T1RocketTile.scala +++ b/t1rocket/src/T1RocketTile.scala @@ -408,7 
+408,8 @@ case class T1RocketTileParameter( ), floatModuleParameters = Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), - zvbbModuleParameters = Seq() + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq() ) else VFUInstantiateParameter( @@ -442,7 +443,8 @@ case class T1RocketTileParameter( ) ), floatModuleParameters = Seq(), - zvbbModuleParameters = Seq() + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq() ) def t1Parameter: T1Parameter = T1Parameter(