[X86][AMX] Support AMX-TRANSPOSE, part 2 #115660

phoebewang · 2024-11-10T14:48:49Z

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

llvmbot · 2024-11-10T14:49:19Z

@llvm/pr-subscribers-mc
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

Patch is 68.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115660.diff

22 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86_64.def (+12)
(modified) clang/lib/Headers/CMakeLists.txt (+3)
(added) clang/lib/Headers/amxbf16transposeintrin.h (+94)
(added) clang/lib/Headers/amxcomplextransposeintrin.h (+301)
(modified) clang/lib/Headers/amxfp16intrin.h (+35)
(added) clang/lib/Headers/amxfp16transposeintrin.h (+94)
(modified) clang/lib/Headers/amxintrin.h (-32)
(modified) clang/lib/Headers/immintrin.h (+19-3)
(modified) clang/lib/Sema/SemaX86.cpp (+6)
(modified) clang/test/CodeGen/X86/amx_transpose.c (+39)
(modified) clang/test/CodeGen/X86/amx_transpose_api.c (+49-1)
(modified) clang/test/CodeGen/X86/amx_transpose_errors.c (+47-3)
(modified) llvm/include/llvm/IR/IntrinsicsX86.td (+57)
(modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+25-3)
(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+34-24)
(modified) llvm/lib/Target/X86/X86InstrAMX.td (+89)
(modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+23-1)
(modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+7-1)
(modified) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (+76-1)
(modified) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (+48)
(modified) llvm/test/MC/X86/amx-transpose-att.s (+48)
(modified) llvm/test/MC/X86/amx-transpose-intel.s (+48)

diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..cc8637ed9c50da 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z",
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -164,6 +170,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose")
 
 TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 76366ca1f108e9..19013d37f46ef7 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,8 +147,11 @@ set(x86_files
   adxintrin.h
   ammintrin.h
   amxavx512intrin.h
+  amxbf16transposeintrin.h
   amxcomplexintrin.h
+  amxcomplextransposeintrin.h
   amxfp16intrin.h
+  amxfp16transposeintrin.h
   amxfp8intrin.h
   amxintrin.h
   amxtransposeintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
new file mode 100644
index 00000000000000..7d31384e317988
--- /dev/null
+++ b/clang/lib/Headers/amxbf16transposeintrin.h
@@ -0,0 +1,94 @@
+/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+///    tiles \a a and \a b, accumulating the intermediate single-precision
+///    (32-bit) floating-point elements with elements in \a dst, and store the
+///    32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+///					FP32(b.row[k].bf16[2*n+0])
+///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+///					FP32(b.row[k].bf16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+///    tiles src0 and src1, accumulating the intermediate single-precision
+///    (32-bit) floating-point elements with elements in "dst", and store the
+///    32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
+                                        __tile1024i src1) {
+  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                       src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
new file mode 100644
index 00000000000000..06fb53e4deadcd
--- /dev/null
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -0,0 +1,301 @@
+/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (transposed column of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The imaginary part of the \a a element
+///    is multiplied with the real part of the corresponding \a b element, and
+///    the real part of the \a a element is multiplied with the imaginary part
+///    of the corresponding \a b elements. The two accumulated results are
+///    added, and then accumulated into the corresponding row and column of
+///    \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///    of (rtransposed colum of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The real part of the \a a element is
+///    multiplied with the real part of the corresponding \a b element, and the
+///    negated imaginary part of the \a a element is multiplied with the
+///    imaginary part of the corresponding \a b elements. The two accumulated
+///    results are added, and then accumulated into the corresponding row and
+///    column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///    containing complex elements and accumulate the results into a packed
+///    single precision tile. Each dword element in input tiles \a a and \a b
+///    is interpreted as a complex number with FP16 real part and FP16 imaginary
+///    part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (transposed column of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The negated imaginary part of the \a a
+///    element is multiplied with the real part of the corresponding \a b
+///    element, and the real part of the \a a element is multiplied with the
+///    imaginary part of the corresponding \a b elements. The two accumulated
+///    results are added, and then accumulated into the corresponding row and
+///    column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
+  __builtin_ia32_tconjtcmmimfp16ps(dst, a, b)
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
+///    and writes the result to \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtfp16(__tile dst, __tile a);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR i := 0 TO dst.rows - 1
+///	FOR j := 0 TO (dst.colsb / 4) - 1
+///		tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
+///		tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
+///	ENDFOR
+///	write_row_and_zero(dst, i, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The source tile. Max size is 1024 Bytes.
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtfp16_internal(
+    unsigned short m, unsigned short n, _tile1024i src) {
+  return __builtin_ia32_tconjtfp16_internal(m, n, src);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles src0 and src1 is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+///    This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                                __tile1024i src1) {
+  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                          dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles src0 and src1 is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+///    This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+                                __tile1024i src1) {
+  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+                                          dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///    containing complex elements and accumulate the results into a packed
+///    single precision tile. Each dword element in input tiles src0 and src1
+///    is interpreted as a complex number with FP16 real part and FP16 imaginary
+///    part.
+///    This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                                    __tile1024i src1) {
+  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                              dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from src and
+///    writes the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src
+///    The source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
+  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h
index ed798245d41efb..bb4bc31fdafd50 100644
--- a/clang/lib/Headers/amxfp16intrin.h
+++ b/clang/lib/Headers/amxfp16intrin.h
@@ -15,6 +15,10 @@
 #define __AMX_FP16INTRIN_H
 #ifdef __x86_64__
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
 ///    and \a b, accumulating the intermediate single-precision (32-bit)
 ///    floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
 #define _tile_dpfp16ps(dst, a, b)                                \
   __builtin_ia32_tdpfp16ps(dst, a, b)
 
+/// This is internal intrinsic. C/C++ user should avoid calli...
[truncated]

llvmbot · 2024-11-10T14:49:20Z

@llvm/pr-subscribers-llvm-ir

Author: Phoebe Wang (phoebewang)

Changes

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

Patch is 68.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115660.diff

22 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86_64.def (+12)
(modified) clang/lib/Headers/CMakeLists.txt (+3)
(added) clang/lib/Headers/amxbf16transposeintrin.h (+94)
(added) clang/lib/Headers/amxcomplextransposeintrin.h (+301)
(modified) clang/lib/Headers/amxfp16intrin.h (+35)
(added) clang/lib/Headers/amxfp16transposeintrin.h (+94)
(modified) clang/lib/Headers/amxintrin.h (-32)
(modified) clang/lib/Headers/immintrin.h (+19-3)
(modified) clang/lib/Sema/SemaX86.cpp (+6)
(modified) clang/test/CodeGen/X86/amx_transpose.c (+39)
(modified) clang/test/CodeGen/X86/amx_transpose_api.c (+49-1)
(modified) clang/test/CodeGen/X86/amx_transpose_errors.c (+47-3)
(modified) llvm/include/llvm/IR/IntrinsicsX86.td (+57)
(modified) llvm/lib/Target/X86/X86ExpandPseudo.cpp (+25-3)
(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+34-24)
(modified) llvm/lib/Target/X86/X86InstrAMX.td (+89)
(modified) llvm/lib/Target/X86/X86LowerAMXType.cpp (+23-1)
(modified) llvm/lib/Target/X86/X86RegisterInfo.cpp (+7-1)
(modified) llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll (+76-1)
(modified) llvm/test/MC/Disassembler/X86/amx-transpose-att.txt (+48)
(modified) llvm/test/MC/X86/amx-transpose-att.s (+48)
(modified) llvm/test/MC/X86/amx-transpose-intel.s (+48)

diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..cc8637ed9c50da 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z",
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -164,6 +170,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
 TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose")
 
 TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512")
 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 76366ca1f108e9..19013d37f46ef7 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,8 +147,11 @@ set(x86_files
   adxintrin.h
   ammintrin.h
   amxavx512intrin.h
+  amxbf16transposeintrin.h
   amxcomplexintrin.h
+  amxcomplextransposeintrin.h
   amxfp16intrin.h
+  amxfp16transposeintrin.h
   amxfp8intrin.h
   amxintrin.h
   amxtransposeintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
new file mode 100644
index 00000000000000..7d31384e317988
--- /dev/null
+++ b/clang/lib/Headers/amxbf16transposeintrin.h
@@ -0,0 +1,94 @@
+/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+///    tiles \a a and \a b, accumulating the intermediate single-precision
+///    (32-bit) floating-point elements with elements in \a dst, and store the
+///    32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+///					FP32(b.row[k].bf16[2*n+0])
+///			tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+///					FP32(b.row[k].bf16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+///    tiles src0 and src1, accumulating the intermediate single-precision
+///    (32-bit) floating-point elements with elements in "dst", and store the
+///    32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
+                                        __tile1024i src1) {
+  dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                       src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
new file mode 100644
index 00000000000000..06fb53e4deadcd
--- /dev/null
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -0,0 +1,301 @@
+/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (transposed column of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The imaginary part of the \a a element
+///    is multiplied with the real part of the corresponding \a b element, and
+///    the real part of the \a a element is multiplied with the imaginary part
+///    of the corresponding \a b elements. The two accumulated results are
+///    added, and then accumulated into the corresponding row and column of
+///    \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///    of (rtransposed colum of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The real part of the \a a element is
+///    multiplied with the real part of the corresponding \a b element, and the
+///    negated imaginary part of the \a a element is multiplied with the
+///    imaginary part of the corresponding \a b elements. The two accumulated
+///    results are added, and then accumulated into the corresponding row and
+///    column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///    containing complex elements and accumulate the results into a packed
+///    single precision tile. Each dword element in input tiles \a a and \a b
+///    is interpreted as a complex number with FP16 real part and FP16 imaginary
+///    part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (transposed column of \a a, column of \a b), it performs a set of
+///    multiplication and accumulations on all corresponding complex numbers
+///    (one from \a a and one from \a b). The negated imaginary part of the \a a
+///    element is multiplied with the real part of the corresponding \a b
+///    element, and the real part of the \a a element is multiplied with the
+///    imaginary part of the corresponding \a b elements. The two accumulated
+///    results are added, and then accumulated into the corresponding row and
+///    column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO a.rows - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
+  __builtin_ia32_tconjtcmmimfp16ps(dst, a, b)
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
+///    and writes the result to \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtfp16(__tile dst, __tile a);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR i := 0 TO dst.rows - 1
+///	FOR j := 0 TO (dst.colsb / 4) - 1
+///		tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
+///		tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
+///	ENDFOR
+///	write_row_and_zero(dst, i, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The source tile. Max size is 1024 Bytes.
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
+    unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+    _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtfp16_internal(
+    unsigned short m, unsigned short n, _tile1024i src) {
+  return __builtin_ia32_tconjtfp16_internal(m, n, src);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles src0 and src1 is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+///    This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                                __tile1024i src1) {
+  dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                          dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles src0 and src1 is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+///    This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+                                __tile1024i src1) {
+  dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+                                          dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+///    containing complex elements and accumulate the results into a packed
+///    single precision tile. Each dword element in input tiles src0 and src1
+///    is interpreted as a complex number with FP16 real part and FP16 imaginary
+///    part.
+///    This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                                    __tile1024i src1) {
+  dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                              dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from src and
+///    writes the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src
+///    The source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
+  dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h
index ed798245d41efb..bb4bc31fdafd50 100644
--- a/clang/lib/Headers/amxfp16intrin.h
+++ b/clang/lib/Headers/amxfp16intrin.h
@@ -15,6 +15,10 @@
 #define __AMX_FP16INTRIN_H
 #ifdef __x86_64__
 
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
 /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
 ///    and \a b, accumulating the intermediate single-precision (32-bit)
 ///    floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
 #define _tile_dpfp16ps(dst, a, b)                                \
   __builtin_ia32_tdpfp16ps(dst, a, b)
 
+/// This is internal intrinsic. C/C++ user should avoid calli...
[truncated]

github-actions · 2024-11-10T14:52:00Z

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:

git-clang-format --diff 01d233ff403823389f8480897e41aea84ecbb3d3 a8a00400bd5cf0b8a2d114708b20ffabd5b9bb9e --extensions c,cpp,h -- clang/lib/Headers/amxbf16transposeintrin.h clang/lib/Headers/amxcomplextransposeintrin.h clang/lib/Headers/amxfp16transposeintrin.h clang/lib/Headers/amxfp16intrin.h clang/lib/Headers/amxintrin.h clang/lib/Headers/immintrin.h clang/lib/Sema/SemaX86.cpp clang/test/CodeGen/X86/amx_transpose.c clang/test/CodeGen/X86/amx_transpose_api.c clang/test/CodeGen/X86/amx_transpose_errors.c llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp

View the diff from clang-format here.

diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
index 11abaf98e9..84883fdaee 100644
--- a/clang/lib/Headers/amxcomplextransposeintrin.h
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -45,11 +45,10 @@
 ///	tmp := dst.row[m]
 ///	FOR k := 0 TO a.rows - 1
 ///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+///FP32(b.row[k].fp16[2*n+1]) 			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
+///FP32(b.row[k].fp16[2*n+0]) 		ENDFOR 	ENDFOR 	write_row_and_zero(dst, m, tmp,
+///dst.colsb)
 /// ENDFOR
 /// zero_upper_rows(dst, dst.rows)
 /// zero_tileconfig_start()
@@ -91,11 +90,10 @@
 ///	tmp := dst.row[m]
 ///	FOR k := 0 TO a.rows - 1
 ///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
-///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+///FP32(b.row[k].fp16[2*n+0]) 			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) *
+///FP32(b.row[k].fp16[2*n+1]) 		ENDFOR 	ENDFOR 	write_row_and_zero(dst, m, tmp,
+///dst.colsb)
 /// ENDFOR
 /// zero_upper_rows(dst, dst.rows)
 /// zero_tileconfig_start()
@@ -138,11 +136,10 @@
 ///	tmp := dst.row[m]
 ///	FOR k := 0 TO a.rows - 1
 ///		FOR n := 0 TO (dst.colsb / 4) - 1
-///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
-///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
-///		ENDFOR
-///	ENDFOR
-///	write_row_and_zero(dst, m, tmp, dst.colsb)
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+///FP32(b.row[k].fp16[2*n+1]) 			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) *
+///FP32(b.row[k].fp16[2*n+0]) 		ENDFOR 	ENDFOR 	write_row_and_zero(dst, m, tmp,
+///dst.colsb)
 /// ENDFOR
 /// zero_upper_rows(dst, dst.rows)
 /// zero_tileconfig_start()

fzou1 · 2024-11-11T15:27:26Z

llvm/lib/Target/X86/X86LowerAMXType.cpp

+  case Intrinsic::x86_ttdpfp16ps_internal:
+  case Intrinsic::x86_ttcmmimfp16ps_internal:
+  case Intrinsic::x86_ttcmmrlfp16ps_internal:
+  case Intrinsic::x86_tconjtcmmimfp16ps_internal: {


You may rebase the branch and merge the code with AMX-TF32 intrinsic (https://github.com/llvm/llvm-project/pull/115625/files#diff-69bcebccbbe093fbfcf4393d7115a823206ee71a8b98b78d83d12642e1afcc5aR279)

fzou1 · 2024-11-13T02:07:06Z

clang/lib/Headers/amxbf16transposeintrin.h

+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)


__builtin_ia32_ttdpbf16ps(dst, a, b) -> __builtin_ia32_ttdpbf16ps((dst), (a), (b))

fzou1 · 2024-11-13T02:10:33Z

clang/lib/Headers/amxcomplextransposeintrin.h

+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)


fzou1 · 2024-11-13T02:10:41Z

clang/lib/Headers/amxcomplextransposeintrin.h

+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)


fzou1 · 2024-11-13T02:10:50Z

clang/lib/Headers/amxcomplextransposeintrin.h

+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_conjtcmmimfp16ps(dst, a, b)                                      \
+  __builtin_ia32_tconjtcmmimfp16ps(dst, a, b)


fzou1 · 2024-11-13T02:10:57Z

clang/lib/Headers/amxcomplextransposeintrin.h

+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The source tile. Max size is 1024 Bytes.
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a)


fzou1 · 2024-11-13T02:11:13Z

clang/lib/Headers/amxfp16transposeintrin.h

+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps(dst, a, b)


fzou1

LGTM

[X86][AMX] Support AMX-TRANSPOSE, part 2

f2fc493

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368

phoebewang requested review from RKSimon and xiangzh1 November 10, 2024 14:48

llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics mc Machine (object) code llvm:ir labels Nov 10, 2024

fzou1 reviewed Nov 12, 2024

View reviewed changes

Merge remote-tracking branch 'origin/main' into AMX-TRANSPOSE

96146a0

fzou1 reviewed Nov 13, 2024

View reviewed changes

Address review comments

a8a0040

fzou1 approved these changes Nov 14, 2024

View reviewed changes

phoebewang merged commit 813f7c3 into llvm:main Nov 14, 2024
7 of 8 checks passed

phoebewang deleted the AMX-TRANSPOSE branch November 14, 2024 05:51

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[X86][AMX] Support AMX-TRANSPOSE, part 2 #115660

[X86][AMX] Support AMX-TRANSPOSE, part 2 #115660

phoebewang commented Nov 10, 2024

llvmbot commented Nov 10, 2024 •

edited

Loading

llvmbot commented Nov 10, 2024

github-actions bot commented Nov 10, 2024 •

edited

Loading

fzou1 Nov 11, 2024

phoebewang Nov 13, 2024

fzou1 Nov 13, 2024

phoebewang Nov 14, 2024

fzou1 Nov 13, 2024

phoebewang Nov 14, 2024

fzou1 Nov 13, 2024

phoebewang Nov 14, 2024

fzou1 Nov 13, 2024

phoebewang Nov 14, 2024

fzou1 Nov 13, 2024

phoebewang Nov 14, 2024

fzou1 Nov 13, 2024

phoebewang Nov 14, 2024

fzou1 left a comment

[X86][AMX] Support AMX-TRANSPOSE, part 2 #115660

[X86][AMX] Support AMX-TRANSPOSE, part 2 #115660

Conversation

phoebewang commented Nov 10, 2024

llvmbot commented Nov 10, 2024 • edited Loading

llvmbot commented Nov 10, 2024

github-actions bot commented Nov 10, 2024 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

fzou1 left a comment

Choose a reason for hiding this comment

llvmbot commented Nov 10, 2024 •

edited

Loading

github-actions bot commented Nov 10, 2024 •

edited

Loading