-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86][AMX] Support AMX-TRANSPOSE, part 2 #115660
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-x86 Author: Phoebe Wang (phoebewang) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/671368 Patch is 68.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115660.diff 22 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..cc8637ed9c50da 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z",
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -164,6 +170,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 76366ca1f108e9..19013d37f46ef7 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,8 +147,11 @@ set(x86_files
adxintrin.h
ammintrin.h
amxavx512intrin.h
+ amxbf16transposeintrin.h
amxcomplexintrin.h
+ amxcomplextransposeintrin.h
amxfp16intrin.h
+ amxfp16transposeintrin.h
amxfp8intrin.h
amxintrin.h
amxtransposeintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
new file mode 100644
index 00000000000000..7d31384e317988
--- /dev/null
+++ b/clang/lib/Headers/amxbf16transposeintrin.h
@@ -0,0 +1,94 @@
+/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+/// tiles \a a and \a b, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in \a dst, and store the
+/// 32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO (a.colsb / 4) - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+/// FP32(b.row[k].bf16[2*n+0])
+/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+/// FP32(b.row[k].bf16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+/// tiles src0 and src1, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in "dst", and store the
+/// 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
new file mode 100644
index 00000000000000..06fb53e4deadcd
--- /dev/null
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -0,0 +1,301 @@
+/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles \a a and \a b is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+/// of (transposed column of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The imaginary part of the \a a element
+/// is multiplied with the real part of the corresponding \a b element, and
+/// the real part of the \a a element is multiplied with the imaginary part
+/// of the corresponding \a b elements. The two accumulated results are
+/// added, and then accumulated into the corresponding row and column of
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles \a a and \a b is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+/// of (rtransposed colum of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The real part of the \a a element is
+/// multiplied with the real part of the corresponding \a b element, and the
+/// negated imaginary part of the \a a element is multiplied with the
+/// imaginary part of the corresponding \a b elements. The two accumulated
+/// results are added, and then accumulated into the corresponding row and
+/// column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+/// containing complex elements and accumulate the results into a packed
+/// single precision tile. Each dword element in input tiles \a a and \a b
+/// is interpreted as a complex number with FP16 real part and FP16 imaginary
+/// part.
+/// Calculates the imaginary part of the result. For each possible combination
+/// of (transposed column of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The negated imaginary part of the \a a
+/// element is multiplied with the real part of the corresponding \a b
+/// element, and the real part of the \a a element is multiplied with the
+/// imaginary part of the corresponding \a b elements. The two accumulated
+/// results are added, and then accumulated into the corresponding row and
+/// column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_conjtcmmimfp16ps(dst, a, b) \
+ __builtin_ia32_tconjtcmmimfp16ps(dst, a, b)
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
+/// and writes the result to \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtfp16(__tile dst, __tile a);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR i := 0 TO dst.rows - 1
+/// FOR j := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
+/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
+/// ENDFOR
+/// write_row_and_zero(dst, i, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The source tile. Max size is 1024 Bytes.
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtfp16_internal(
+ unsigned short m, unsigned short n, _tile1024i src) {
+ return __builtin_ia32_tconjtfp16_internal(m, n, src);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+/// containing complex elements and accumulate the results into a packed
+/// single precision tile. Each dword element in input tiles src0 and src1
+/// is interpreted as a complex number with FP16 real part and FP16 imaginary
+/// part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from src and
+/// writes the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
+ dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h
index ed798245d41efb..bb4bc31fdafd50 100644
--- a/clang/lib/Headers/amxfp16intrin.h
+++ b/clang/lib/Headers/amxfp16intrin.h
@@ -15,6 +15,10 @@
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
#define _tile_dpfp16ps(dst, a, b) \
__builtin_ia32_tdpfp16ps(dst, a, b)
+/// This is internal intrinsic. C/C++ user should avoid calli...
[truncated]
|
@llvm/pr-subscribers-llvm-ir Author: Phoebe Wang (phoebewang) ChangesRef.: https://cdrdv2.intel.com/v1/dl/getContent/671368 Patch is 68.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115660.diff 22 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 9f7462b1e0d962..cc8637ed9c50da 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -133,6 +133,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1_internal, "vUsUsUsV256i*V256i*vC*z",
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1_internal, "vUsUsUsV256i*V256i*vC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed_internal, "V256iUsUsV256i", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16_internal, "V256iUsUsV256i", "n", "amx-complex,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps_internal, "V16fUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", "amx-avx512,avx10.2-512")
@@ -164,6 +170,12 @@ TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz0t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1, "vIUcvC*z", "n", "amx-transpose")
TARGET_BUILTIN(__builtin_ia32_t2rpntlvwz1t1, "vIUcvC*z", "n","amx-transpose")
TARGET_BUILTIN(__builtin_ia32_ttransposed, "vIUcIUc", "n", "amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_ttcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex,amx-transpose")
+TARGET_BUILTIN(__builtin_ia32_tconjtfp16, "vIUcIUc", "n", "amx-complex,amx-transpose")
TARGET_BUILTIN(__builtin_ia32_tcvtrowd2ps, "V16fIUcUi", "n", "amx-avx512,avx10.2-512")
TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16h, "V32yIUcUi", "n", "amx-avx512,avx10.2-512")
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 76366ca1f108e9..19013d37f46ef7 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,8 +147,11 @@ set(x86_files
adxintrin.h
ammintrin.h
amxavx512intrin.h
+ amxbf16transposeintrin.h
amxcomplexintrin.h
+ amxcomplextransposeintrin.h
amxfp16intrin.h
+ amxfp16transposeintrin.h
amxfp8intrin.h
amxintrin.h
amxtransposeintrin.h
diff --git a/clang/lib/Headers/amxbf16transposeintrin.h b/clang/lib/Headers/amxbf16transposeintrin.h
new file mode 100644
index 00000000000000..7d31384e317988
--- /dev/null
+++ b/clang/lib/Headers/amxbf16transposeintrin.h
@@ -0,0 +1,94 @@
+/*===----- amxbf16transposeintrin.h - AMX-BF16 and AMX-TRANSPOSE ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxbf16transposeintrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_BF16TRANSPOSEINTRIN_H
+#define __AMX_BF16TRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-bf16,amx-transpose")))
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+/// tiles \a a and \a b, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in \a dst, and store the
+/// 32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_tdpbf16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO (a.colsb / 4) - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+0]) *
+/// FP32(b.row[k].bf16[2*n+0])
+/// tmp.bf32[n] += FP32(a.row[m].bf16[2*k+1]) *
+/// FP32(b.row[k].bf16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTDPBF16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b)
+
+/// This is internal intrinsic. C/C++ user should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS
+_tile_tdpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Compute transpose and dot-product of BF16 (16-bit) floating-point pairs in
+/// tiles src0 and src1, accumulating the intermediate single-precision
+/// (32-bit) floating-point elements with elements in "dst", and store the
+/// 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTDPBF16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static __inline__ void __tile_tdpbf16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tdpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __x86_64__ */
+#endif /* __AMX_BF16TRANSPOSEINTRIN_H */
diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
new file mode 100644
index 00000000000000..06fb53e4deadcd
--- /dev/null
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -0,0 +1,301 @@
+/*===----- amxcomplextransposeintrin.h - AMX-COMPLEX and AMX-TRANSPOSE ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <amxcomplextransposeintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXTRANSPOSEINTRIN_H
+#define __AMX_COMPLEXTRANSPOSEINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-complex,amx-transpose")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles \a a and \a b is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+/// of (transposed column of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The imaginary part of the \a a element
+/// is multiplied with the real part of the corresponding \a b element, and
+/// the real part of the \a a element is multiplied with the imaginary part
+/// of the corresponding \a b elements. The two accumulated results are
+/// added, and then accumulated into the corresponding row and column of
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles \a a and \a b is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+/// of (rtransposed colum of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The real part of the \a a element is
+/// multiplied with the real part of the corresponding \a b element, and the
+/// negated imaginary part of the \a a element is multiplied with the
+/// imaginary part of the corresponding \a b elements. The two accumulated
+/// results are added, and then accumulated into the corresponding row and
+/// column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_tcmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b)
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+/// containing complex elements and accumulate the results into a packed
+/// single precision tile. Each dword element in input tiles \a a and \a b
+/// is interpreted as a complex number with FP16 real part and FP16 imaginary
+/// part.
+/// Calculates the imaginary part of the result. For each possible combination
+/// of (transposed column of \a a, column of \a b), it performs a set of
+/// multiplication and accumulations on all corresponding complex numbers
+/// (one from \a a and one from \a b). The negated imaginary part of the \a a
+/// element is multiplied with the real part of the corresponding \a b
+/// element, and the real part of the \a a element is multiplied with the
+/// imaginary part of the corresponding \a b elements. The two accumulated
+/// results are added, and then accumulated into the corresponding row and
+/// column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtcmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO a.rows - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTCMMIMFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_conjtcmmimfp16ps(dst, a, b) \
+ __builtin_ia32_tconjtcmmimfp16ps(dst, a, b)
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from \a a
+/// and writes the result to \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_conjtfp16(__tile dst, __tile a);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR i := 0 TO dst.rows - 1
+/// FOR j := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp16[2*j+0] := a.row[j].fp16[2*i+0]
+/// tmp.fp16[2*j+1] := -a.row[j].fp16[2*i+1]
+/// ENDFOR
+/// write_row_and_zero(dst, i, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCONJTFP16 instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The source tile. Max size is 1024 Bytes.
+#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmimfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_tcmmrlfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_ttcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtcmmimfp16ps_internal(
+ unsigned short m, unsigned short n, unsigned short k, _tile1024i dst,
+ _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tconjtcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS _tile_conjtfp16_internal(
+ unsigned short m, unsigned short n, _tile1024i src) {
+ return __builtin_ia32_tconjtfp16_internal(m, n, src);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number
+/// with FP16 real part and FP16 imaginary part.
+/// This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TTCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_tcmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_tcmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix conjugate transpose and multiplication of two tiles
+/// containing complex elements and accumulate the results into a packed
+/// single precision tile. Each dword element in input tiles src0 and src1
+/// is interpreted as a complex number with FP16 real part and FP16 imaginary
+/// part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtcmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_conjtcmmimfp16ps_internal(src0.row, src1.col, src0.col,
+ dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform conjugate transpose of an FP16-pair of complex elements from src and
+/// writes the result to dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCONJTFP16 </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src
+/// The source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS
+static void __tile_conjtfp16(__tile1024i *dst, __tile1024i src) {
+ dst->tile = _tile_conjtfp16_internal(src.row, src.col, src.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXTRANSPOSEINTRIN_H
diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h
index ed798245d41efb..bb4bc31fdafd50 100644
--- a/clang/lib/Headers/amxfp16intrin.h
+++ b/clang/lib/Headers/amxfp16intrin.h
@@ -15,6 +15,10 @@
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
+
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
@@ -54,5 +58,36 @@
#define _tile_dpfp16ps(dst, a, b) \
__builtin_ia32_tdpfp16ps(dst, a, b)
+/// This is internal intrinsic. C/C++ user should avoid calli...
[truncated]
|
You can test this locally with the following command:git-clang-format --diff 01d233ff403823389f8480897e41aea84ecbb3d3 a8a00400bd5cf0b8a2d114708b20ffabd5b9bb9e --extensions c,cpp,h -- clang/lib/Headers/amxbf16transposeintrin.h clang/lib/Headers/amxcomplextransposeintrin.h clang/lib/Headers/amxfp16transposeintrin.h clang/lib/Headers/amxfp16intrin.h clang/lib/Headers/amxintrin.h clang/lib/Headers/immintrin.h clang/lib/Sema/SemaX86.cpp clang/test/CodeGen/X86/amx_transpose.c clang/test/CodeGen/X86/amx_transpose_api.c clang/test/CodeGen/X86/amx_transpose_errors.c llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86ISelLowering.cpp llvm/lib/Target/X86/X86LowerAMXType.cpp llvm/lib/Target/X86/X86RegisterInfo.cpp View the diff from clang-format here.diff --git a/clang/lib/Headers/amxcomplextransposeintrin.h b/clang/lib/Headers/amxcomplextransposeintrin.h
index 11abaf98e9..84883fdaee 100644
--- a/clang/lib/Headers/amxcomplextransposeintrin.h
+++ b/clang/lib/Headers/amxcomplextransposeintrin.h
@@ -45,11 +45,10 @@
/// tmp := dst.row[m]
/// FOR k := 0 TO a.rows - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
-/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
-/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
-/// ENDFOR
-/// ENDFOR
-/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+///FP32(b.row[k].fp16[2*n+1]) tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
+///FP32(b.row[k].fp16[2*n+0]) ENDFOR ENDFOR write_row_and_zero(dst, m, tmp,
+///dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
@@ -91,11 +90,10 @@
/// tmp := dst.row[m]
/// FOR k := 0 TO a.rows - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
-/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
-/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
-/// ENDFOR
-/// ENDFOR
-/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+///FP32(b.row[k].fp16[2*n+0]) tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) *
+///FP32(b.row[k].fp16[2*n+1]) ENDFOR ENDFOR write_row_and_zero(dst, m, tmp,
+///dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
@@ -138,11 +136,10 @@
/// tmp := dst.row[m]
/// FOR k := 0 TO a.rows - 1
/// FOR n := 0 TO (dst.colsb / 4) - 1
-/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
-/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
-/// ENDFOR
-/// ENDFOR
-/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+///FP32(b.row[k].fp16[2*n+1]) tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) *
+///FP32(b.row[k].fp16[2*n+0]) ENDFOR ENDFOR write_row_and_zero(dst, m, tmp,
+///dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
|
case Intrinsic::x86_ttdpfp16ps_internal: | ||
case Intrinsic::x86_ttcmmimfp16ps_internal: | ||
case Intrinsic::x86_ttcmmrlfp16ps_internal: | ||
case Intrinsic::x86_tconjtcmmimfp16ps_internal: { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You may rebase the branch and merge the code with AMX-TF32 intrinsic (https://github.com/llvm/llvm-project/pull/115625/files#diff-69bcebccbbe093fbfcf4393d7115a823206ee71a8b98b78d83d12642e1afcc5aR279)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param b | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
#define _tile_tdpbf16ps(dst, a, b) __builtin_ia32_ttdpbf16ps(dst, a, b) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
__builtin_ia32_ttdpbf16ps(dst, a, b) -> __builtin_ia32_ttdpbf16ps((dst), (a), (b))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param b | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
#define _tile_tcmmimfp16ps(dst, a, b) __builtin_ia32_ttcmmimfp16ps(dst, a, b) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param b | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
#define _tile_tcmmrlfp16ps(dst, a, b) __builtin_ia32_ttcmmrlfp16ps(dst, a, b) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
/// \param b | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
#define _tile_conjtcmmimfp16ps(dst, a, b) \ | ||
__builtin_ia32_tconjtcmmimfp16ps(dst, a, b) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
/// The destination tile. Max size is 1024 Bytes. | ||
/// \param a | ||
/// The source tile. Max size is 1024 Bytes. | ||
#define _tile_conjtfp16(dst, a) __builtin_ia32_tconjtfp16(dst, a) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
/// The 1st source tile. Max size is 1024 Bytes. | ||
/// \param b | ||
/// The 2nd source tile. Max size is 1024 Bytes. | ||
#define _tile_tdpfp16ps(dst, a, b) __builtin_ia32_ttdpfp16ps(dst, a, b) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368