From fd570cb8d41f5f94b61d515985245fc81aab633e Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Thu, 24 Oct 2024 21:56:48 +0800 Subject: [PATCH 1/6] Support AMX-FP8 Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/BuiltinsX86_64.def | 6 +++ clang/include/clang/Driver/Options.td | 2 + clang/lib/Basic/Targets/X86.cpp | 6 +++ clang/lib/Basic/Targets/X86.h | 1 + clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/amxfp8intrin.h | 24 ++++++++++++ clang/lib/Headers/immintrin.h | 4 ++ clang/lib/Sema/SemaX86.cpp | 4 ++ clang/test/CodeGen/X86/amx_fp8.c | 27 +++++++++++++ clang/test/CodeGen/X86/amx_fp8_errors.c | 10 +++++ clang/test/CodeGen/X86/amx_fp8_inline_asm.c | 32 +++++++++++++++ llvm/include/llvm/IR/IntrinsicsX86.td | 17 ++++++++ .../llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/Target/X86/X86.td | 3 ++ llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++++++++++ llvm/lib/Target/X86/X86InstrAMX.td | 39 +++++++++++++++++++ llvm/lib/Target/X86/X86InstrPredicates.td | 1 + llvm/lib/TargetParser/Host.cpp | 4 ++ llvm/lib/TargetParser/X86TargetParser.cpp | 1 + llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll | 20 ++++++++++ .../Disassembler/X86/AMX/x86-64-amx-fp8.txt | 34 ++++++++++++++++ llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s | 33 ++++++++++++++++ llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s | 33 ++++++++++++++++ 24 files changed, 327 insertions(+) create mode 100644 clang/lib/Headers/amxfp8intrin.h create mode 100644 clang/test/CodeGen/X86/amx_fp8.c create mode 100644 clang/test/CodeGen/X86/amx_fp8_errors.c create mode 100644 clang/test/CodeGen/X86/amx_fp8_inline_asm.c create mode 100644 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6a95337815174b..da0ab888ce200d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -642,6 +642,7 @@ X86 Support - Supported intrinsics for ``MOVRS AND AVX10.2``. * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``. +- Support ISA of ``AMX-FP8``. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index e1e613560167ac..68904ae8abcd15 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -155,6 +155,12 @@ TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd") // AMX_FP16 FP16 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16") +// AMX FP8 +TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8") +TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8") +TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8") +TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8") + // RAO-INT TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint") TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 5df6ddd5e6a0c5..bbada0834526d7 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6290,6 +6290,8 @@ def mamx_fp16 : Flag<["-"], "mamx-fp16">, Group; def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group; def mamx_int8 : Flag<["-"], "mamx-int8">, Group; def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group; +def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group; +def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group; def mamx_tile : Flag<["-"], "mamx-tile">, Group; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group; def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index d067ec218b5270..b95261c39a5993 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -420,6 +420,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAMXTILE = true; } else if (Feature == "+amx-complex") { HasAMXCOMPLEX = true; + } else if (Feature == "+amx-fp8") { + HasAMXFP8 = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -939,6 +941,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_FP16__"); if (HasAMXCOMPLEX) Builder.defineMacro("__AMX_COMPLEX__"); + if (HasAMXFP8) + Builder.defineMacro("__AMX_FP8__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -1069,6 +1073,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-int8", true) .Case("amx-tile", true) + .Case("amx-fp8", true) .Case("avx", true) .Case("avx10.1-256", true) .Case("avx10.1-512", true) @@ -1187,6 +1192,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) + .Case("amx-fp8", HasAMXFP8) .Case("avx", SSELevel >= AVX) .Case("avx10.1-256", HasAVX10_1) .Case("avx10.1-512", HasAVX10_1_512) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index e8aad3ec5a74b1..a1b2a0cec209ab 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -157,6 +157,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXINT8 = false; bool HasAMXBF16 = false; bool HasAMXCOMPLEX = false; + bool HasAMXFP8 = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUSERMSR = false; diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index e97953d87a2ff9..142cd01ac5aec0 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -149,6 +149,7 @@ set(x86_files amxcomplexintrin.h amxfp16intrin.h amxintrin.h + amxfp8intrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h avx10_2_512minmaxintrin.h diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h new file mode 100644 index 00000000000000..d187b5f0421bbb --- /dev/null +++ b/clang/lib/Headers/amxfp8intrin.h @@ -0,0 +1,24 @@ +/*===---------- amxfp8intrin.h - AMX intrinsics -*- C++ -*------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXFP8INTRIN_H +#define __AMXFP8INTRIN_H +#ifdef __x86_64__ + +#define _tile_dpbf8ps __builtin_ia32_tdpbf8ps +#define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps +#define _tile_dphbf8ps __builtin_ia32_tdphbf8ps +#define _tile_dphf8ps __builtin_ia32_tdphf8ps + +#endif /* __x86_64__ */ +#endif /* __AMXFP8INTRIN_H */ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 5f296d0a3324d0..5529f99ab0c6b6 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -648,6 +648,10 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 6a4d78f0ca9084..0e43b030e70d41 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -640,6 +640,10 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdpfp16ps: case X86::BI__builtin_ia32_tcmmimfp16ps: case X86::BI__builtin_ia32_tcmmrlfp16ps: + case X86::BI__builtin_ia32_tdpbf8ps: + case X86::BI__builtin_ia32_tdpbhf8ps: + case X86::BI__builtin_ia32_tdphbf8ps: + case X86::BI__builtin_ia32_tdphf8ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); } } diff --git a/clang/test/CodeGen/X86/amx_fp8.c b/clang/test/CodeGen/X86/amx_fp8.c new file mode 100644 index 00000000000000..9c79514f891299 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_fp8.c @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-fp8 \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s +#include + +void test_amx(void *data) { + //CHECK-LABEL: @test_amx + //CHECK: call void @llvm.x86.tdpbf8ps(i8 1, i8 2, i8 3) + _tile_dpbf8ps(1, 2, 3); +} + +void test_amx2(void *data) { + //CHECK-LABEL: @test_amx2 + //CHECK: call void @llvm.x86.tdpbhf8ps(i8 1, i8 2, i8 3) + _tile_dpbhf8ps(1, 2, 3); +} + +void test_amx3(void *data) { + //CHECK-LABEL: @test_amx3 + //CHECK: call void @llvm.x86.tdphbf8ps(i8 1, i8 2, i8 3) + _tile_dphbf8ps(1, 2, 3); +} + +void test_amx4(void *data) { + //CHECK-LABEL: @test_amx4 + //CHECK: call void @llvm.x86.tdphf8ps(i8 1, i8 2, i8 3) + _tile_dphf8ps(1, 2, 3); +} diff --git a/clang/test/CodeGen/X86/amx_fp8_errors.c b/clang/test/CodeGen/X86/amx_fp8_errors.c new file mode 100644 index 00000000000000..77cbd34905b8ba --- /dev/null +++ b/clang/test/CodeGen/X86/amx_fp8_errors.c @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-fp8 -verify + +#include + +void test_amx(void *data) { + _tile_dpbf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dpbhf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dphbf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dphf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} +} diff --git a/clang/test/CodeGen/X86/amx_fp8_inline_asm.c b/clang/test/CodeGen/X86/amx_fp8_inline_asm.c new file mode 100644 index 00000000000000..49331bd9d368ab --- /dev/null +++ b/clang/test/CodeGen/X86/amx_fp8_inline_asm.c @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-fp8 -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s + +void f_tilemul(short a) +{ + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdpbf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdpbf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); + + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdpbhf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdpbhf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); + + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdphbf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdphbf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); + + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdphf8ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdphf8ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); +} diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index d0083017fb9383..6530051e0d3e9f 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5994,6 +5994,23 @@ let TargetPrefix = "x86" in { [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_x86amx_ty, llvm_x86amx_ty], []>; + + def int_x86_tdpbf8ps : ClangBuiltin<"__builtin_ia32_tdpbf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; + def int_x86_tdpbhf8ps : ClangBuiltin<"__builtin_ia32_tdpbhf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; + def int_x86_tdphbf8ps : ClangBuiltin<"__builtin_ia32_tdphbf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; + def int_x86_tdphf8ps : ClangBuiltin<"__builtin_ia32_tdphf8ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, + ImmArg>, ImmArg>]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 073e19f8187c65..19e8e0013ef6a0 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -264,6 +264,7 @@ X86_FEATURE_COMPAT(AVX10_2_512, "avx10.2-512", 0) //FIXME: make MOVRS _COMPAT defined when gcc landed relate patch. X86_FEATURE (MOVRS, "movrs") X86_FEATURE (ZU, "zu") +X86_FEATURE (AMX_FP8, "amx-fp8") // These features aren't really CPU features, but the frontend can set them. X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 6bedf9e1d13ac3..79c60402a49f0a 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -270,6 +270,9 @@ def FeatureAMXFP16 : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true", def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", "Support AMX-COMPLEX instructions", [FeatureAMXTILE]>; +def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", + "Support AMX-FP8 instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a6d77873ec2901..5a7313ac3e1234 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37503,6 +37503,29 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PTDPBF8PS: + case X86::PTDPBHF8PS: + case X86::PTDPHBF8PS: + case X86::PTDPHF8PS: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch(MI.getOpcode()) { + default: llvm_unreachable("Unexpected instruction!"); + case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; + case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; + case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; + case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); + + MI.eraseFromParent(); + return BB; + } } } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 99deacc811a170..d0c91ab7b5e696 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -267,3 +267,42 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in { } } // SchedRW = [WriteSystem] } + +// AMX-FP8 +let Predicates = [HasAMXFP8, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in { + class AMX_FP8_BASE Opcode, string Opstr> : + I, VEX, VVVV; + } + + def TDPBF8PS : AMX_FP8_BASE<0xfd, "tdpbf8ps">, T_MAP5, PS; + def TDPBHF8PS : AMX_FP8_BASE<0xfd, "tdpbhf8ps">, T_MAP5, XD; + def TDPHBF8PS : AMX_FP8_BASE<0xfd, "tdphbf8ps">, T_MAP5, XS; + def TDPHF8PS : AMX_FP8_BASE<0xfd, "tdphf8ps">, T_MAP5, PD; + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. + // To be translated to the actual instructions in X86ISelLowering.cpp + def PTDPBF8PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbf8ps timm:$src1, + timm:$src2, timm:$src3)]>; + def PTDPBHF8PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbhf8ps timm:$src1, + timm:$src2, timm:$src3)]>; + def PTDPHBF8PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdphbf8ps timm:$src1, + timm:$src2, timm:$src3)]>; + def PTDPHF8PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdphf8ps timm:$src1, + timm:$src2, timm:$src3)]>; + } + } +} diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 7fb566fba51818..5b659d3b072dca 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -183,6 +183,7 @@ def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">; def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; +def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 5c4e3a9dc52b0f..78991b3936505c 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1876,6 +1876,10 @@ const StringMap sys::getHostCPUFeatures() { MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX); Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1); + bool HasLeaf1E = + MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX); + Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; + bool HasLeaf24 = MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX); diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 586df5748aa822..7d60b81d4bb1c3 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -598,6 +598,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; diff --git a/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll b/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll new file mode 100644 index 00000000000000..f5d3f6ec9ec298 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-fp8 | FileCheck %s + +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK: tdpbf8ps %tmm3, %tmm2, %tmm1 +; CHECK: tdpbhf8ps %tmm3, %tmm2, %tmm1 +; CHECK: tdphbf8ps %tmm3, %tmm2, %tmm1 +; CHECK: tdphf8ps %tmm3, %tmm2, %tmm1 + +define void @test_amx(){ +call void @llvm.x86.tdpbf8ps(i8 1, i8 2, i8 3) +call void @llvm.x86.tdpbhf8ps(i8 1, i8 2, i8 3) +call void @llvm.x86.tdphbf8ps(i8 1, i8 2, i8 3) +call void @llvm.x86.tdphf8ps(i8 1, i8 2, i8 3) +ret void +} +declare void @llvm.x86.tdpbf8ps(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdpbhf8ps(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdphbf8ps(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdphf8ps(i8 %tile0, i8 %tile1, i8 %tile2) diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt new file mode 100644 index 00000000000000..e714a52d2c31a7 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt @@ -0,0 +1,34 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: tdpbf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdpbf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x58,0xfd,0xf5 + +# ATT: tdpbf8ps %tmm1, %tmm2, %tmm3 +# INTEL: tdpbf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x70,0xfd,0xda + +# ATT: tdpbhf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdpbhf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x5b,0xfd,0xf5 + +# ATT: tdpbhf8ps %tmm1, %tmm2, %tmm3 +# INTEL: tdpbhf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x73,0xfd,0xda + +# ATT: tdphbf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdphbf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x5a,0xfd,0xf5 + +# ATT: tdphbf8ps %tmm1, %tmm2, %tmm3 +# INTEL: tdphbf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x72,0xfd,0xda + +# ATT: tdphf8ps %tmm4, %tmm5, %tmm6 +# INTEL: tdphf8ps tmm6, tmm5, tmm4 +0xc4,0xe5,0x59,0xfd,0xf5 + +# ATT: tdphf8ps %tmm1, %tmm2, %tmm3 +# INTEL: tdphf8ps tmm3, tmm2, tmm1 +0xc4,0xe5,0x71,0xfd,0xda diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s new file mode 100644 index 00000000000000..904539ec4917fe --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s + +// CHECK: tdpbf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x58,0xfd,0xf5] + tdpbf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdpbf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x70,0xfd,0xda] + tdpbf8ps %tmm1, %tmm2, %tmm3 + +// CHECK: tdpbhf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x5b,0xfd,0xf5] + tdpbhf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdpbhf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x73,0xfd,0xda] + tdpbhf8ps %tmm1, %tmm2, %tmm3 + +// CHECK: tdphbf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x5a,0xfd,0xf5] + tdphbf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdphbf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x72,0xfd,0xda] + tdphbf8ps %tmm1, %tmm2, %tmm3 + +// CHECK: tdphf8ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe5,0x59,0xfd,0xf5] + tdphf8ps %tmm4, %tmm5, %tmm6 + +// CHECK: tdphf8ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe5,0x71,0xfd,0xda] + tdphf8ps %tmm1, %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s new file mode 100644 index 00000000000000..4191ae6f5cd133 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: tdpbf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x58,0xfd,0xf5] + tdpbf8ps tmm6, tmm5, tmm4 + +// CHECK: tdpbf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x70,0xfd,0xda] + tdpbf8ps tmm3, tmm2, tmm1 + +// CHECK: tdpbhf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x5b,0xfd,0xf5] + tdpbhf8ps tmm6, tmm5, tmm4 + +// CHECK: tdpbhf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x73,0xfd,0xda] + tdpbhf8ps tmm3, tmm2, tmm1 + +// CHECK: tdphbf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x5a,0xfd,0xf5] + tdphbf8ps tmm6, tmm5, tmm4 + +// CHECK: tdphbf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x72,0xfd,0xda] + tdphbf8ps tmm3, tmm2, tmm1 + +// CHECK: tdphf8ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe5,0x59,0xfd,0xf5] + tdphf8ps tmm6, tmm5, tmm4 + +// CHECK: tdphf8ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe5,0x71,0xfd,0xda] + tdphf8ps tmm3, tmm2, tmm1 From e960360797ec027b6de3e19dcf26d83e75365e11 Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Mon, 28 Oct 2024 10:12:16 +0800 Subject: [PATCH 2/6] Apply clang format --- llvm/lib/Target/X86/X86ISelLowering.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5a7313ac3e1234..a83f3925813daf 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37509,12 +37509,21 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPHF8PS: { const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; - switch(MI.getOpcode()) { - default: llvm_unreachable("Unexpected instruction!"); - case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; - case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; - case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; - case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected instruction!"); + case X86::PTDPBF8PS: + Opc = X86::TDPBF8PS; + break; + case X86::PTDPBHF8PS: + Opc = X86::TDPBHF8PS; + break; + case X86::PTDPHBF8PS: + Opc = X86::TDPHBF8PS; + break; + case X86::PTDPHF8PS: + Opc = X86::TDPHF8PS; + break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); From e28c3230bbb34e27a85de2496db6b5a873eb526a Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Wed, 30 Oct 2024 09:53:20 +0800 Subject: [PATCH 3/6] Address comments. --- clang/lib/Headers/amxfp8intrin.h | 61 ++++++++++++++++++++++++- llvm/lib/Target/X86/X86.td | 4 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 42 ++++------------- llvm/lib/Target/X86/X86InstrAMX.td | 38 +++++++-------- 4 files changed, 90 insertions(+), 55 deletions(-) diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h index d187b5f0421bbb..b6e6a105bc42f3 100644 --- a/clang/lib/Headers/amxfp8intrin.h +++ b/clang/lib/Headers/amxfp8intrin.h @@ -1,4 +1,4 @@ -/*===---------- amxfp8intrin.h - AMX intrinsics -*- C++ -*------------=== +/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------=== * * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. @@ -15,9 +15,68 @@ #define __AMXFP8INTRIN_H #ifdef __x86_64__ + +/// Compute dot-product of brain-float8 (BF8) or hybrid-float8 (HF8) +/// floating-point pairs in tiles \a a and \a b, accumulating the +/// intermediate single-precision (32-bit) floating-point elements with +/// elements in \a dst, and store the 32-bit result back to tile \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPBF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbf8ps __builtin_ia32_tdpbf8ps + +/// \code +/// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPBHF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps + +/// \code +/// void _tile_dphbf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPHBF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dphbf8ps __builtin_ia32_tdphbf8ps + +/// \code +/// void _tile_dphf8ps (__tile dst, __tile a, __tile b) +/// \endcode +/// +/// This intrinsic corresponds to the \c TDPHF8PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dphf8ps __builtin_ia32_tdphf8ps #endif /* __x86_64__ */ diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 79c60402a49f0a..c7882acc044e04 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -271,8 +271,8 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", "Support AMX-COMPLEX instructions", [FeatureAMXTILE]>; def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true", - "Support AMX-FP8 instructions", - [FeatureAMXTILE]>; + "Support AMX-FP8 instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a83f3925813daf..2f2987eec24e62 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37410,7 +37410,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBUSD: case X86::PTDPBUUD: case X86::PTDPBF16PS: - case X86::PTDPFP16PS: { + case X86::PTDPFP16PS: + case X86::PTDPBF8PS: + case X86::PTDPBHF8PS: + case X86::PTDPHBF8PS: + case X86::PTDPHF8PS: { unsigned Opc; switch (MI.getOpcode()) { // clang-format off @@ -37421,6 +37425,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; + case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break; + case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; + case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; + case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; // clang-format on } @@ -37503,38 +37511,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.eraseFromParent(); // The pseudo is gone now. return BB; } - case X86::PTDPBF8PS: - case X86::PTDPBHF8PS: - case X86::PTDPHBF8PS: - case X86::PTDPHF8PS: { - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Opc; - switch (MI.getOpcode()) { - default: - llvm_unreachable("Unexpected instruction!"); - case X86::PTDPBF8PS: - Opc = X86::TDPBF8PS; - break; - case X86::PTDPBHF8PS: - Opc = X86::TDPBHF8PS; - break; - case X86::PTDPHBF8PS: - Opc = X86::TDPHBF8PS; - break; - case X86::PTDPHF8PS: - Opc = X86::TDPHF8PS; - break; - } - - MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); - MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); - - MI.eraseFromParent(); - return BB; - } } } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index d0c91ab7b5e696..202232ccb8bc72 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -274,9 +274,9 @@ let Predicates = [HasAMXFP8, In64BitMode] in { let Constraints = "$src1 = $dst" in { class AMX_FP8_BASE Opcode, string Opstr> : I, VEX, VVVV; + (ins TILE:$src1, TILE:$src2, TILE:$src3), + !strconcat(Opstr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, VEX, VVVV; } def TDPBF8PS : AMX_FP8_BASE<0xfd, "tdpbf8ps">, T_MAP5, PS; @@ -287,22 +287,22 @@ let Predicates = [HasAMXFP8, In64BitMode] in { let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp - def PTDPBF8PS : PseudoI<(outs), (ins u8imm:$src1, - u8imm:$src2, u8imm:$src3), - [(int_x86_tdpbf8ps timm:$src1, - timm:$src2, timm:$src3)]>; - def PTDPBHF8PS : PseudoI<(outs), (ins u8imm:$src1, - u8imm:$src2, u8imm:$src3), - [(int_x86_tdpbhf8ps timm:$src1, - timm:$src2, timm:$src3)]>; - def PTDPHBF8PS : PseudoI<(outs), (ins u8imm:$src1, - u8imm:$src2, u8imm:$src3), - [(int_x86_tdphbf8ps timm:$src1, - timm:$src2, timm:$src3)]>; - def PTDPHF8PS : PseudoI<(outs), (ins u8imm:$src1, - u8imm:$src2, u8imm:$src3), - [(int_x86_tdphf8ps timm:$src1, - timm:$src2, timm:$src3)]>; + def PTDPBF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbf8ps timm:$src1, timm:$src2, + timm:$src3)]>; + def PTDPBHF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbhf8ps timm:$src1, timm:$src2, + timm:$src3)]>; + def PTDPHBF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdphbf8ps timm:$src1, timm:$src2, + timm:$src3)]>; + def PTDPHF8PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tdphf8ps timm:$src1, timm:$src2, + timm:$src3)]>; } } } From b9a8b40364a7285c8e4acc79b3adbdeeddb515fa Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Wed, 30 Oct 2024 10:01:14 +0800 Subject: [PATCH 4/6] Address more comments. --- llvm/lib/TargetParser/Host.cpp | 4 ++-- .../Disassembler/X86/AMX/{x86-64-amx-fp8.txt => amx-fp8.txt} | 0 llvm/test/MC/X86/AMX/{x86-64-amx-fp8-att.s => amx-fp8-att.s} | 0 .../MC/X86/AMX/{x86-64-amx-fp8-intel.s => amx-fp8-intel.s} | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename llvm/test/MC/Disassembler/X86/AMX/{x86-64-amx-fp8.txt => amx-fp8.txt} (100%) rename llvm/test/MC/X86/AMX/{x86-64-amx-fp8-att.s => amx-fp8-att.s} (100%) rename llvm/test/MC/X86/AMX/{x86-64-amx-fp8-intel.s => amx-fp8-intel.s} (100%) diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 78991b3936505c..fd34a276cf3ce5 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1876,8 +1876,8 @@ const StringMap sys::getHostCPUFeatures() { MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX); Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1); - bool HasLeaf1E = - MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX); + bool HasLeaf1E = MaxLevel >= 0x1e && + !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; bool HasLeaf24 = diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt b/llvm/test/MC/Disassembler/X86/AMX/amx-fp8.txt similarity index 100% rename from llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt rename to llvm/test/MC/Disassembler/X86/AMX/amx-fp8.txt diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s b/llvm/test/MC/X86/AMX/amx-fp8-att.s similarity index 100% rename from llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s rename to llvm/test/MC/X86/AMX/amx-fp8-att.s diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s b/llvm/test/MC/X86/AMX/amx-fp8-intel.s similarity index 100% rename from llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s rename to llvm/test/MC/X86/AMX/amx-fp8-intel.s From 4ce35440ca0436c3bf5619dd7c0646418a78d977 Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Wed, 30 Oct 2024 10:32:51 +0800 Subject: [PATCH 5/6] Update intrinsics comments. --- clang/lib/Headers/amxfp8intrin.h | 35 ++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h index b6e6a105bc42f3..8ff4ddf2acc446 100644 --- a/clang/lib/Headers/amxfp8intrin.h +++ b/clang/lib/Headers/amxfp8intrin.h @@ -15,11 +15,12 @@ #define __AMXFP8INTRIN_H #ifdef __x86_64__ - -/// Compute dot-product of brain-float8 (BF8) or hybrid-float8 (HF8) -/// floating-point pairs in tiles \a a and \a b, accumulating the -/// intermediate single-precision (32-bit) floating-point elements with -/// elements in \a dst, and store the 32-bit result back to tile \a dst. +/// These instructions compute dot product of brain-float8 (BF8) or +/// hybrid-float8 (HF8) accumulating into a single precision (FP32). The input +/// elements can be BF8 or HF8. These instructions have three tile operands, one +/// source/dest accumulator operand, and two source operands, \a a and \a b. The +/// \a a and \a b operands can be BF8 or HF8 independently, and the source/dest +/// operand, \a dst is always FP32. /// /// \headerfile /// @@ -27,7 +28,9 @@ /// void _tile_dpbf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPBF8PS instruction. +/// This intrinsic corresponds to the \c TDPBF8PS instruction, which is the dot +/// product of a BF8 value (\a a) by a BF8 value (\a b) accumulating into a +/// Single Precision (FP32) source/dest (\a dst). /// /// \param dst /// The destination tile. Max size is 1024 Bytes. @@ -35,13 +38,15 @@ /// The 1st source tile. Max size is 1024 Bytes. /// \param b /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbf8ps __builtin_ia32_tdpbf8ps +#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b)) /// \code /// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPBHF8PS instruction. +/// This intrinsic corresponds to the \c TDPBHF8PS instruction, which is the dot +/// product of a BF8 value (\a a) by an HF8 value (\a b) accumulating into a +/// Single Precision (FP32) source/dest (\a dst). /// /// \param dst /// The destination tile. Max size is 1024 Bytes. @@ -49,13 +54,15 @@ /// The 1st source tile. Max size is 1024 Bytes. /// \param b /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps +#define _tile_dpbhf8ps(dst, a, b) __builtin_ia32_tdpbhf8ps((dst), (a), (b)) /// \code /// void _tile_dphbf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPHBF8PS instruction. +/// This intrinsic corresponds to the \c TDPHBF8PS instruction, which is the dot +/// product of an HF8 value (\a a) by a BF8 value (\a b) accumulating into a +/// Single Precision (FP32) source/dest (\a dst). /// /// \param dst /// The destination tile. Max size is 1024 Bytes. @@ -63,13 +70,15 @@ /// The 1st source tile. Max size is 1024 Bytes. /// \param b /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dphbf8ps __builtin_ia32_tdphbf8ps +#define _tile_dphbf8ps(dst, a, b) __builtin_ia32_tdphbf8ps((dst), (a), (b)) /// \code /// void _tile_dphf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPHF8PS instruction. +/// This intrinsic corresponds to the \c TDPHF8PS instruction, which is the dot +/// product of an HF8 value (\a a) by an HF8 value (\a b) accumulating into a +/// Single Precision (FP32) source/dest (\a dst). /// /// \param dst /// The destination tile. Max size is 1024 Bytes. @@ -77,7 +86,7 @@ /// The 1st source tile. Max size is 1024 Bytes. /// \param b /// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dphf8ps __builtin_ia32_tdphf8ps +#define _tile_dphf8ps(dst, a, b) __builtin_ia32_tdphf8ps((dst), (a), (b)) #endif /* __x86_64__ */ #endif /* __AMXFP8INTRIN_H */ From 4ada168d0ad18d144ef05ed76136797c4ea6edab Mon Sep 17 00:00:00 2001 From: Feng Zou Date: Wed, 30 Oct 2024 11:39:03 +0800 Subject: [PATCH 6/6] Update intrinsic description. --- clang/lib/Headers/amxfp8intrin.h | 39 +++++++++++++++++--------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h index 8ff4ddf2acc446..0f5ddc87e5a752 100644 --- a/clang/lib/Headers/amxfp8intrin.h +++ b/clang/lib/Headers/amxfp8intrin.h @@ -15,12 +15,8 @@ #define __AMXFP8INTRIN_H #ifdef __x86_64__ -/// These instructions compute dot product of brain-float8 (BF8) or -/// hybrid-float8 (HF8) accumulating into a single precision (FP32). The input -/// elements can be BF8 or HF8. These instructions have three tile operands, one -/// source/dest accumulator operand, and two source operands, \a a and \a b. The -/// \a a and \a b operands can be BF8 or HF8 independently, and the source/dest -/// operand, \a dst is always FP32. +/// Peform the dot product of a BF8 value \a a by a BF8 value \a b accumulating +/// into a Single Precision (FP32) source/dest \a dst. /// /// \headerfile /// @@ -28,9 +24,7 @@ /// void _tile_dpbf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPBF8PS instruction, which is the dot -/// product of a BF8 value (\a a) by a BF8 value (\a b) accumulating into a -/// Single Precision (FP32) source/dest (\a dst). +/// This intrinsic corresponds to the \c TDPBF8PS instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. @@ -40,13 +34,16 @@ /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b)) +/// Perform the dot product of a BF8 value \a a by an HF8 value \a b +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// /// \code /// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPBHF8PS instruction, which is the dot -/// product of a BF8 value (\a a) by an HF8 value (\a b) accumulating into a -/// Single Precision (FP32) source/dest (\a dst). +/// This intrinsic corresponds to the \c TDPBHF8PS instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. @@ -56,13 +53,16 @@ /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbhf8ps(dst, a, b) __builtin_ia32_tdpbhf8ps((dst), (a), (b)) +/// Perform the dot product of an HF8 value \a a by a BF8 value \a b +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// /// \code /// void _tile_dphbf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPHBF8PS instruction, which is the dot -/// product of an HF8 value (\a a) by a BF8 value (\a b) accumulating into a -/// Single Precision (FP32) source/dest (\a dst). +/// This intrinsic corresponds to the \c TDPHBF8PS instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. @@ -72,13 +72,16 @@ /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dphbf8ps(dst, a, b) __builtin_ia32_tdphbf8ps((dst), (a), (b)) +/// Perform the dot product of an HF8 value \a a by an HF8 value \a b +/// accumulating into a Single Precision (FP32) source/dest \a dst. +/// +/// \headerfile +/// /// \code /// void _tile_dphf8ps (__tile dst, __tile a, __tile b) /// \endcode /// -/// This intrinsic corresponds to the \c TDPHF8PS instruction, which is the dot -/// product of an HF8 value (\a a) by an HF8 value (\a b) accumulating into a -/// Single Precision (FP32) source/dest (\a dst). +/// This intrinsic corresponds to the \c TDPHF8PS instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes.