Xilinx · niwinanto · Jan 17, 2025 · Jan 17, 2025 · khallouh · Jan 18, 2025
@@ -300,3 +300,6 @@ BUILTIN(__builtin_aie2p_tanh, "V16yV16g", "nc")
 
 //division/mod
 BUILTIN(__builtin_aie2p_divstep, "vUi&Ui&Ui", "nc")
+
+// SHUFFLE
+BUILTIN(__builtin_aie2p_vshuffle_576_bfp16, "vV64cV8cV64cV8ciV64c&V8c&", "nc")
@@ -22048,6 +22048,8 @@ static llvm::Intrinsic::ID getAIE2PIntrinsicFunction(unsigned BuiltinID) {
     return Intrinsic::aie2p_vsub_lt32;
   case AIE::BI__builtin_aie2p_divstep:
     return Intrinsic::aie2p_divs;
+  case AIE::BI__builtin_aie2p_vshuffle_576_bfp16:
+    return Intrinsic::aie2p_vshuffle_576_bfp16;
   default:
     break;
   }
@@ -22274,6 +22276,30 @@ Value *CodeGenFunction::EmitAIEBuiltinExpr(unsigned BuiltinID,
     Value *DivAddr = EmitLValue(E->getArg(1)).getPointer(*this);
     return Builder.CreateDefaultAlignedStore(Div, DivAddr);
   }
+  case AIE::BI__builtin_aie2p_vshuffle_576_bfp16: {
+    SmallVector<Value *, 3> Ops; // call operands: all arguments but the last two
+    for (unsigned I = 0; I < E->getNumArgs() - 2; I++)
+      Ops.push_back(EmitScalarExpr(E->getArg(I)));
+
+    llvm::Intrinsic::ID IntrinsicID = getAIEIntrinsicFunction(BuiltinID, Arch);
+    assert(IntrinsicID != Intrinsic::not_intrinsic);
+    Function *F = CGM.getIntrinsic(IntrinsicID);
+    Value *Val = Builder.CreateCall(F, Ops);
+
+    // Element 0 of the returned struct is the bfp16 mantissa; store it through
+    // the first reference argument (the second-to-last builtin argument).
+    Value *Mant = Builder.CreateExtractValue(Val, 0);
+    Value *MantAddr =
+        EmitLValue(E->getArg(E->getNumArgs() - 2)).getPointer(*this);
+    Builder.CreateDefaultAlignedStore(Mant, MantAddr);
+
+    // Element 1 of the returned struct is the bfp16 exponent; store it through
+    // the second reference argument (the last builtin argument).
+    Value *Exp = Builder.CreateExtractValue(Val, 1);
+    Value *ExpAddr =
+        EmitLValue(E->getArg(E->getNumArgs() - 1)).getPointer(*this);
+    return Builder.CreateDefaultAlignedStore(Exp, ExpAddr);
+  }
   default:
     break;
   }
@@ -22452,7 +22478,8 @@ Value *CodeGenFunction::EmitAIE2PBuiltinExpr(unsigned BuiltinID,
   case AIE::BI__builtin_aie2p_vsub_lt8:
   case AIE::BI__builtin_aie2p_vsub_lt16:
   case AIE::BI__builtin_aie2p_vsub_lt32:
-  case AIE::BI__builtin_aie2p_divstep: {
+  case AIE::BI__builtin_aie2p_divstep:
+  case AIE::BI__builtin_aie2p_vshuffle_576_bfp16: {
     return this->EmitAIEBuiltinExpr(BuiltinID, E, Arch);
   }
   default:

@@ -812,10 +812,6 @@ inline __attribute__((always_inline)) v256uint8_sparse shuffle(v256uint8_sparse
 inline __attribute__((always_inline)) v128uint16_sparse shuffle(v128uint16_sparse , int );
 inline __attribute__((always_inline)) v128uint8_sparse shuffle(v128uint8_sparse , int );
 inline __attribute__((always_inline)) v64uint16_sparse shuffle(v64uint16_sparse , int );
-inline __attribute__((always_inline)) v64bfp16ebs8 shuffle(v64bfp16ebs8 , v64bfp16ebs8 , unsigned int );
-inline __attribute__((always_inline)) v64bfp16ebs16 shuffle(v64bfp16ebs16 , v64bfp16ebs16 , unsigned int );
-inline __attribute__((always_inline)) v64bfp16ebs8 shuffle(v64bfp16ebs8 , unsigned int );
-inline __attribute__((always_inline)) v64bfp16ebs16 shuffle(v64bfp16ebs16 , unsigned int );
 inline __attribute__((always_inline)) v128bfp16ebs8 shuffle(v128bfp16ebs8 , unsigned int );
 inline __attribute__((always_inline)) v128bfp16ebs16_sparse shuffle(v128bfp16ebs16_sparse , unsigned int );
 inline __attribute__((always_inline)) v256bfp16ebs16_sparse shuffle(v256bfp16ebs16_sparse , unsigned int );

@@ -1806,4 +1806,32 @@ INTRINSIC(v64accfloat) broadcast_zero_to_v64accfloat() {
   return __builtin_bit_cast(v64accfloat, (float)0 - v64float{0});
 }
 
+INTRINSIC(v64bfp16ebs8) // bfp16 shuffle of a and b; mode is forwarded as-is
+shuffle(v64bfp16ebs8 a, v64bfp16ebs8 b, unsigned int mode) {
+  v64bfp16ebs8 res; // written by the builtin through its two reference args
+  __builtin_aie2p_vshuffle_576_bfp16(a.mantissa, a.exponent, b.mantissa,
+                                     b.exponent, mode, (v64char &)res.mantissa,
+                                     (v8char &)res.exponent);
+  return res;
+}
+
+INTRINSIC(v64bfp16ebs16) // bfp16 shuffle of a and b; mode is forwarded as-is
+shuffle(v64bfp16ebs16 a, v64bfp16ebs16 b, unsigned int mode) {
+  v64bfp16ebs16 res; // written by the builtin through its two reference args
+  __builtin_aie2p_vshuffle_576_bfp16(a.mantissa, a.exponent, b.mantissa,
+                                     b.exponent, mode, (v64char &)res.mantissa,
+                                     (v8char &)res.exponent);
+  return res;
+}
+
+INTRINSIC(v64bfp16ebs8) shuffle(v64bfp16ebs8 a, unsigned mode) {
+  v64bfp16ebs8 unDef; // deliberately uninitialized second operand
+  return shuffle(a, unDef, mode); // forwards to the two-operand overload
+}
+
+INTRINSIC(v64bfp16ebs16) shuffle(v64bfp16ebs16 a, unsigned mode) {
+  v64bfp16ebs16 unDef; // deliberately uninitialized second operand
+  return shuffle(a, unDef, mode); // forwards to the two-operand overload
+}
+
 #endif /*__AIEV2_SCL2VEC_H__*/
@@ -171,6 +171,8 @@ typedef int16_t v32int16 __attribute__((__vector_size__(64)))
 __attribute__((aligned(__MIN_ALIGNMENT)));
 typedef int8_t v64int8 __attribute__((__vector_size__(64)))
 __attribute__((aligned(__MIN_ALIGNMENT)));
+typedef char v64char __attribute__((__vector_size__(64)))
+__attribute__((aligned(__MIN_ALIGNMENT)));
 typedef uint32_t v8uint64 __attribute__((__vector_size__(64)))
 __attribute__((aligned(__MIN_ALIGNMENT)));
 typedef uint32_t v16uint32 __attribute__((__vector_size__(64)))
@@ -246,6 +248,7 @@ typedef int16_t v4int16 __attribute__((__vector_size__(8)));
 typedef uint16_t v4uint16 __attribute__((__vector_size__(8)));
 typedef uint8_t v8uint8 __attribute__((__vector_size__(8)));
 typedef int8_t v8int8 __attribute__((__vector_size__(8)));
+typedef char v8char __attribute__((__vector_size__(8)));
 typedef buint8_t v16uint4 __attribute__((__vector_size__(8)));
 typedef bint8_t v16int4 __attribute__((__vector_size__(8)));
 /* vector types */

@@ -6,7 +6,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 
@@ -212,7 +212,6 @@ v16int32 test_broadcast_elem_128(v16int32 a, int b){
   v128uint4 test_upd_elem(v128uint4 v, int idx, v2uint4 b) {
     return upd_elem(v, idx, b);
   }
-//
 // AIE2P-LABEL: define dso_local noundef <64 x i8> @_Z13test_upd_elemDv64_DU8_iDv2_S_(
 // AIE2P-SAME: <64 x i8> noundef [[V:%.*]], i32 noundef [[IDX:%.*]], <2 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIE2P-NEXT:  entry:
@@ -233,7 +232,6 @@ v16int32 test_broadcast_elem_128(v16int32 a, int b){
   v128uint4 test_upd_elem(v128uint4 v, int idx, v8uint4 b) {
     return upd_elem(v, idx, b);
   }
-//
 // AIE2P-LABEL: define dso_local noundef <64 x i8> @_Z13test_upd_elemDv64_DU8_iDv8_S_(
 // AIE2P-SAME: <64 x i8> noundef [[V:%.*]], i32 noundef [[IDX:%.*]], <8 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIE2P-NEXT:  entry:
@@ -260,7 +258,6 @@ v16int32 test_broadcast_elem_128(v16int32 a, int b){
   v64uint8 test_upd_elem(v64uint8 v, int idx, unsigned char b) {
     return upd_elem(v, idx, b);
   }
-//
 // AIE2P-LABEL: define dso_local noundef <64 x i8> @_Z13test_upd_elemDv64_hiDv2_h(
 // AIE2P-SAME: <64 x i8> noundef [[V:%.*]], i32 noundef [[IDX:%.*]], <2 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIE2P-NEXT:  entry:
@@ -332,7 +329,6 @@ v16int32 test_broadcast_elem_128(v16int32 a, int b){
   v32uint16 test_upd_elem(v32uint16 v, int idx, v4uint16 b) {
     return upd_elem(v, idx, b);
   }
-//
 // AIE2P-LABEL: define dso_local noundef <16 x i32> @_Z13test_upd_elemDv16_jij(
 // AIE2P-SAME: <16 x i32> noundef [[V:%.*]], i32 noundef [[IDX:%.*]], i32 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIE2P-NEXT:  entry:
@@ -624,6 +620,7 @@ v16int32 test_shuffle_u64(mask64 b, unsigned int m) {
   return shuffle_u64(b, m);
 }
 
+//
 // AIE2P-LABEL: define dso_local noundef <16 x i32> @_Z11test_insertDv16_iiDv2_j(
 // AIE2P-SAME: <16 x i32> noundef [[V:%.*]], i32 noundef [[IDX:%.*]], <2 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // AIE2P-NEXT:  entry:
@@ -729,3 +726,71 @@ v64accfloat
 test_broadcast_zero_to_v64accfloat() {
   return broadcast_zero_to_v64accfloat();
 }
+
+// AIE2P-LABEL: define dso_local %struct.v64bfp16ebs8 @_Z12shuffle_test12v64bfp16ebs8S_j(
+// AIE2P-SAME: [[STRUCT_V64BFP16EBS8:%.*]] [[A_COERCE:%.*]], [[STRUCT_V64BFP16EBS8]] [[B_COERCE:%.*]], i32 noundef [[MODE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AIE2P-NEXT:  entry:
+// AIE2P-NEXT:    [[A_COERCE_FCA_0_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS8]] [[A_COERCE]], 0
+// AIE2P-NEXT:    [[A_COERCE_FCA_1_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS8]] [[A_COERCE]], 1
+// AIE2P-NEXT:    [[B_COERCE_FCA_0_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS8]] [[B_COERCE]], 0
+// AIE2P-NEXT:    [[B_COERCE_FCA_1_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS8]] [[B_COERCE]], 1
+// AIE2P-NEXT:    [[TMP0:%.*]] = tail call { <64 x i8>, <8 x i8> } @llvm.aie2p.vshuffle.576.bfp16(<64 x i8> [[A_COERCE_FCA_0_EXTRACT_I]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT_I]], <64 x i8> [[B_COERCE_FCA_0_EXTRACT_I]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT_I]], i32 [[MODE]])
+// AIE2P-NEXT:    [[TMP1:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 0
+// AIE2P-NEXT:    [[TMP2:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 1
+// AIE2P-NEXT:    [[DOTFCA_0_INSERT_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS8]] poison, <64 x i8> [[TMP1]], 0
+// AIE2P-NEXT:    [[DOTFCA_1_INSERT_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS8]] [[DOTFCA_0_INSERT_I]], <8 x i8> [[TMP2]], 1
+// AIE2P-NEXT:    ret [[STRUCT_V64BFP16EBS8]] [[DOTFCA_1_INSERT_I]]
+//
+v64bfp16ebs8 shuffle_test(v64bfp16ebs8 a, v64bfp16ebs8 b, unsigned int mode) {
+  return shuffle(a, b, mode); // two-operand v64bfp16ebs8 overload
+}
+
+// AIE2P-LABEL: define dso_local %struct.v64bfp16ebs16 @_Z12shuffle_test13v64bfp16ebs16S_j(
+// AIE2P-SAME: [[STRUCT_V64BFP16EBS16:%.*]] [[A_COERCE:%.*]], [[STRUCT_V64BFP16EBS16]] [[B_COERCE:%.*]], i32 noundef [[MODE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AIE2P-NEXT:  entry:
+// AIE2P-NEXT:    [[A_COERCE_FCA_0_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS16]] [[A_COERCE]], 0
+// AIE2P-NEXT:    [[A_COERCE_FCA_1_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS16]] [[A_COERCE]], 1
+// AIE2P-NEXT:    [[B_COERCE_FCA_0_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS16]] [[B_COERCE]], 0
+// AIE2P-NEXT:    [[B_COERCE_FCA_1_EXTRACT_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS16]] [[B_COERCE]], 1
+// AIE2P-NEXT:    [[TMP0:%.*]] = tail call { <64 x i8>, <8 x i8> } @llvm.aie2p.vshuffle.576.bfp16(<64 x i8> [[A_COERCE_FCA_0_EXTRACT_I]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT_I]], <64 x i8> [[B_COERCE_FCA_0_EXTRACT_I]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT_I]], i32 [[MODE]])
+// AIE2P-NEXT:    [[TMP1:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 0
+// AIE2P-NEXT:    [[TMP2:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 1
+// AIE2P-NEXT:    [[DOTFCA_0_INSERT_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS16]] poison, <64 x i8> [[TMP1]], 0
+// AIE2P-NEXT:    [[DOTFCA_1_INSERT_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS16]] [[DOTFCA_0_INSERT_I]], <8 x i8> [[TMP2]], 1
+// AIE2P-NEXT:    ret [[STRUCT_V64BFP16EBS16]] [[DOTFCA_1_INSERT_I]]
+//
+v64bfp16ebs16 shuffle_test(v64bfp16ebs16 a, v64bfp16ebs16 b, unsigned int mode) {
+  return shuffle(a, b, mode); // two-operand v64bfp16ebs16 overload
+}
+
+// AIE2P-LABEL: define dso_local %struct.v64bfp16ebs8 @_Z12shuffle_test12v64bfp16ebs8j(
+// AIE2P-SAME: [[STRUCT_V64BFP16EBS8:%.*]] [[A_COERCE:%.*]], i32 noundef [[MODE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AIE2P-NEXT:  entry:
+// AIE2P-NEXT:    [[A_COERCE_FCA_0_EXTRACT_I_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS8]] [[A_COERCE]], 0
+// AIE2P-NEXT:    [[A_COERCE_FCA_1_EXTRACT_I_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS8]] [[A_COERCE]], 1
+// AIE2P-NEXT:    [[TMP0:%.*]] = tail call { <64 x i8>, <8 x i8> } @llvm.aie2p.vshuffle.576.bfp16(<64 x i8> [[A_COERCE_FCA_0_EXTRACT_I_I]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT_I_I]], <64 x i8> undef, <8 x i8> undef, i32 [[MODE]])
+// AIE2P-NEXT:    [[TMP1:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 0
+// AIE2P-NEXT:    [[TMP2:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 1
+// AIE2P-NEXT:    [[DOTFCA_0_INSERT_I_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS8]] poison, <64 x i8> [[TMP1]], 0
+// AIE2P-NEXT:    [[DOTFCA_1_INSERT_I_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS8]] [[DOTFCA_0_INSERT_I_I]], <8 x i8> [[TMP2]], 1
+// AIE2P-NEXT:    ret [[STRUCT_V64BFP16EBS8]] [[DOTFCA_1_INSERT_I_I]]
+//
+v64bfp16ebs8 shuffle_test(v64bfp16ebs8 a, unsigned mode) {
+  return shuffle(a, mode); // single-operand v64bfp16ebs8 overload
+}
+
+// AIE2P-LABEL: define dso_local %struct.v64bfp16ebs16 @_Z12shuffle_test13v64bfp16ebs16j(
+// AIE2P-SAME: [[STRUCT_V64BFP16EBS16:%.*]] [[A_COERCE:%.*]], i32 noundef [[MODE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AIE2P-NEXT:  entry:
+// AIE2P-NEXT:    [[A_COERCE_FCA_0_EXTRACT_I_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS16]] [[A_COERCE]], 0
+// AIE2P-NEXT:    [[A_COERCE_FCA_1_EXTRACT_I_I:%.*]] = extractvalue [[STRUCT_V64BFP16EBS16]] [[A_COERCE]], 1
+// AIE2P-NEXT:    [[TMP0:%.*]] = tail call { <64 x i8>, <8 x i8> } @llvm.aie2p.vshuffle.576.bfp16(<64 x i8> [[A_COERCE_FCA_0_EXTRACT_I_I]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT_I_I]], <64 x i8> undef, <8 x i8> undef, i32 [[MODE]])
+// AIE2P-NEXT:    [[TMP1:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 0
+// AIE2P-NEXT:    [[TMP2:%.*]] = extractvalue { <64 x i8>, <8 x i8> } [[TMP0]], 1
+// AIE2P-NEXT:    [[DOTFCA_0_INSERT_I_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS16]] poison, <64 x i8> [[TMP1]], 0
+// AIE2P-NEXT:    [[DOTFCA_1_INSERT_I_I:%.*]] = insertvalue [[STRUCT_V64BFP16EBS16]] [[DOTFCA_0_INSERT_I_I]], <8 x i8> [[TMP2]], 1
+// AIE2P-NEXT:    ret [[STRUCT_V64BFP16EBS16]] [[DOTFCA_1_INSERT_I_I]]
+//
+v64bfp16ebs16 shuffle_test(v64bfp16ebs16 a, unsigned mode) {
+  return shuffle(a, mode); // single-operand v64bfp16ebs16 overload
+}
@@ -572,4 +572,10 @@ def int_aie2p_sqrtf : ClangBuiltin<"__builtin_aie2p_sqrtf">, AIE2PNLF;
 // DIVS
 def int_aie2p_divs : AIE2PDIVS;
 
+// BFP16 SHUFFLE: two (<64 x i8>, <8 x i8>) pairs plus an i32 in, one such pair out
+class AIE2PSHUFFLEBFP16
+      : Intrinsic<[llvm_v64i8_ty, llvm_v8i8_ty], [llvm_v64i8_ty, llvm_v8i8_ty, llvm_v64i8_ty, llvm_v8i8_ty, llvm_i32_ty],
+					 [IntrNoMem]>;
+def int_aie2p_vshuffle_576_bfp16 :  AIE2PSHUFFLEBFP16;
+
 } // TargetPrefix = "aie2p"
@@ -12,4 +12,5 @@
 //===----------------------------------------------------------------------===//
 
 include "AIEBaseRegisterBanks.td"
+def GPRRegBank : RegisterBank<"GPRRegBank", [eR, eL]>;
 def AccRegBank : RegisterBank<"AccRegBank", [ACC256, ACC512, ACC1024]>;
@@ -134,11 +134,15 @@ bool AIEBaseInstructionSelector::selectG_IMPLICIT_DEF(
   // Make sure no input operands are passed to IMPLICIT_DEF
   while (I.getNumOperands() > 1)
     I.removeOperand(1);
-  const MachineOperand &DstOp = I.getOperand(0);
-  const RegisterBank &RB = *RBI.getRegBank(DstOp.getReg(), MRI, TRI);
-  const TargetRegisterClass &RC =
-      TRI.getMinClassForRegBank(RB, MRI.getType(DstOp.getReg()));
-  return RBI.constrainGenericRegister(DstOp.getReg(), RC, MRI);
+  const Register DstReg = I.getOperand(0).getReg();
+  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(DstReg);
+  const TargetRegisterClass *DstRC =
+      RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+  if (!DstRC) {
+    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+    DstRC = &TRI.getMinClassForRegBank(RB, MRI.getType(DstReg));
+  }
+  return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
 }
 
 bool AIEBaseInstructionSelector::selectG_PHI(MachineInstr &I,

@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-def GPRRegBank : RegisterBank<"GPRRegBank", [eR, eL]>;
 def PTRRegBank : RegisterBank<"PTRRegBank", [eP]>;
 def MODRegBank : RegisterBank<"MODRegBank", [mDm]>;
 def VRegBank : RegisterBank<"VRegBank", [VEC128, VEC256, VEC512, VEC1024]>;
@@ -465,6 +465,8 @@ unsigned AIE2PInstrInfo::getOpCode(MachineInstr &I) const {
       return isSigned ? AIE2P::VUNPACK_mv_unpack_x_unpackSign1
                       : AIE2P::VUNPACK_mv_unpack_x_unpackSign0;
   }
+  case Intrinsic::aie2p_vshuffle_576_bfp16:
+    return AIE2P::VSHUFFLE_vec_shuffle_ex;
   default:
     llvm_unreachable("Unexpected Intrinsic ID");
   }

@@ -107,6 +107,7 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector {
   bool selectReadTM(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectVUNPACK(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectVPACK(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectVSHUFFLE_BFP(MachineInstr &I, MachineRegisterInfo &MRI);
 
 private:
   bool selectImpl(MachineInstr &I,
@@ -253,6 +254,8 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) {
     case Intrinsic::aie2p_v16bf16_to_v16accfloat:
     case Intrinsic::aie2p_v32bf16_to_v32accfloat:
       return selectVCONV(I, MRI);
+    case Intrinsic::aie2p_vshuffle_576_bfp16:
+      return selectVSHUFFLE_BFP(I, MRI);
     default:
       return selectImpl(I, *CoverageInfo);
     }
@@ -2910,6 +2913,58 @@ AIE2PInstructionSelector::getCombinedOpcodeSRSUPS(
   return {};
 }
 
+bool AIE2PInstructionSelector ::selectVSHUFFLE_BFP(MachineInstr &I,
+                                                   MachineRegisterInfo &MRI) {
+  Register DstMant = I.getOperand(0).getReg(); // first result
+  Register DstExp = I.getOperand(1).getReg(); // second result
+  Register Src1Mant = I.getOperand(3).getReg(); // operand 2 unused; presumably the intrinsic ID
+  Register Src1Exp = I.getOperand(4).getReg();
+  Register Src2Mant = I.getOperand(5).getReg();
+  Register Src2Exp = I.getOperand(6).getReg();
+  Register Mode = I.getOperand(7).getReg();
+  // Pack each (mantissa, exponent) pair into one register, shuffle, unpack.
+  unsigned OpCode = TII.getOpCode(I);
+  Register Src1Reg = MRI.createVirtualRegister(&AIE2P::mEXmRegClass);
+  MachineInstrBuilder RegSeq1 = // Src1Reg = {sub_bfp16_x: Src1Mant, sub_bfp16_e: Src1Exp}
+      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {Src1Reg}, {})
+          .addReg(Src1Mant)
+          .addImm(AIE2P::sub_bfp16_x)
+          .addReg(Src1Exp)
+          .addImm(AIE2P::sub_bfp16_e);
+  Register Src2Reg = MRI.createVirtualRegister(&AIE2P::mEXnRegClass);
+  MachineInstrBuilder RegSeq2 = // Src2Reg = {sub_bfp16_x: Src2Mant, sub_bfp16_e: Src2Exp}
+      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {Src2Reg}, {})
+          .addReg(Src2Mant)
+          .addImm(AIE2P::sub_bfp16_x)
+          .addReg(Src2Exp)
+          .addImm(AIE2P::sub_bfp16_e);
+  constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *RegSeq1, // mantissa input
+                           AIE2P::VEC512RegClass, RegSeq1->getOperand(1));
+  constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *RegSeq1, // exponent input
+                           AIE2P::EXPVEC64RegClass, RegSeq1->getOperand(3));
+  constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *RegSeq2, // mantissa input
+                           AIE2P::VEC512RegClass, RegSeq2->getOperand(1));
+  constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *RegSeq2, // exponent input
+                           AIE2P::EXPVEC64RegClass, RegSeq2->getOperand(3));
+
+  Register DstReg = MRI.createVirtualRegister(&AIE2P::mEXmRegClass);
+  MachineInstrBuilder MI = // the selected shuffle; its result holds both parts
+      MIB.buildInstr(OpCode, {DstReg}, {Src1Reg, Src2Reg, Mode});
+
+  auto MantCopyMI = MIB.buildInstr(TargetOpcode::COPY, {DstMant}, {})
+                        .addReg(DstReg, 0, AIE2P::sub_bfp16_x); // split out mantissa
+  auto ExpCopyMI = MIB.buildInstr(TargetOpcode::COPY, {DstExp}, {})
+                       .addReg(DstReg, 0, AIE2P::sub_bfp16_e); // split out exponent
+  constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *MantCopyMI,
+                           AIE2P::VEC512RegClass, MantCopyMI->getOperand(0));
+
+  constrainOperandRegClass(*MF, TRI, MRI, TII, RBI, *ExpCopyMI,
+                           AIE2P::EXPVEC64RegClass, ExpCopyMI->getOperand(0));
+
+  I.eraseFromParent(); // the original intrinsic instruction is replaced by the above
+  return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
+}
+
 namespace llvm {
 InstructionSelector *
 createAIE2PInstructionSelector(const AIE2PTargetMachine &TM,

@@ -862,6 +862,9 @@ AIE2PRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   case AIE2P::eSRegClassID:
   case AIE2P::mS2RegClassID:
   case AIE2P::mS3RegClassID:
+  case AIE2P::EXPVEC64RegClassID:
+  case AIE2P::EXPVEC64_with_sub_hi_exp_in_eEheRegClassID:
+  case AIE2P::EXPVEC64_with_sub_hi_exp_in_eEhoRegClassID:
     return GPRs;
   case AIE2P::ePRegClassID:
   case AIE2P::eSpecial20RegClassID:

@@ -13,3 +13,4 @@
 
 include "AIEBaseRegisterBanks.td"
 def AccRegBank : RegisterBank<"AccRegBank", [ACC512, ACC1024, ACC2048]>;
+def GPRRegBank : RegisterBank<"GPRRegBank", [eR, eL, eE, EXPVEC64]>;
Original file line number	Diff line number	Diff line change
Expand Up		@@ -13,3 +13,4 @@

		include "AIEBaseRegisterBanks.td"
		def AccRegBank : RegisterBank<"AccRegBank", [ACC512, ACC1024, ACC2048]>;
		def GPRRegBank : RegisterBank<"GPRRegBank", [eR, eL, eE, EXPVEC64]>;
Copy link Collaborator andcarminati Jan 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Please, include a new line in the end. Copy link Collaborator khallouh Jan 18, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Should not be needed after rebase