
Clang: convert __m64 intrinsics to unconditionally use SSE2 instead of MMX. #96540

Merged: 9 commits, Jul 24, 2024
Changes from 2 commits
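The change applies one recurring shape across the headers: treat __m64 as a plain 64-bit vector type, widen it into the low half of a 128-bit vector, run the SSE2 operation there, and truncate the result back to 64 bits. A minimal standalone sketch of that shape using Clang vector extensions (the type and helper names here are illustrative, not the PR's exact code):

```c
#include <stdio.h>

typedef int v2si __attribute__((vector_size(8)));  /* 64-bit vector, like __m64    */
typedef int v4si __attribute__((vector_size(16))); /* 128-bit vector, like __m128i */

/* Widen 64 -> 128 bits; the -1 indices leave the upper lanes undefined,
   which is fine because those lanes of the result are discarded anyway. */
static inline v4si widen(v2si x) {
  return __builtin_shufflevector(x, (v2si){}, 0, 1, -1, -1);
}

/* Truncate 128 -> 64 bits by keeping the two low lanes. */
static inline v2si narrow(v4si x) {
  return __builtin_shufflevector(x, (v4si){}, 0, 1);
}

int main(void) {
  v2si a = {10, 20}, b = {1, 2};
  v2si sum = narrow(widen(a) + widen(b)); /* the operation runs at 128 bits */
  printf("%d %d\n", sum[0], sum[1]);      /* 11 22 */
  return 0;
}
```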
4 changes: 2 additions & 2 deletions clang/include/clang/Basic/BuiltinsX86.def
@@ -150,8 +150,8 @@ TARGET_BUILTIN(__builtin_ia32_pmovmskb, "iV8c", "ncV:64:", "mmx,sse")
TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse")
TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse")

// MMX+SSE2
TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
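In BuiltinsX86.def the second field is the prototype string: "iV4sIi" is int(vector-of-4-short, constant int), and the new "sV4sIi" returns short instead, while the feature list drops "mmx" so only SSE is required. The header wrapper is then expected to widen the short result itself. A hedged sketch of such a wrapper, modeled on _mm_extract_pi16 (illustrative, not necessarily the PR's exact header code; needs -msse or stronger):

```c
typedef short __v4hi __attribute__((__vector_size__(8)));

/* The macro form keeps the index a compile-time constant, as the "Ii"
   prototype requires; (unsigned short) zero-extends, matching pextrw. */
#define my_extract_pi16(a, n) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
```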
2 changes: 2 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -14355,6 +14355,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vec_init_v2si:
return Builder.CreateBitCast(BuildVector(Ops),
llvm::Type::getX86_MMXTy(getLLVMContext()));
+case X86::BI__builtin_ia32_vec_ext_v4hi:
case X86::BI__builtin_ia32_vec_ext_v2si:
case X86::BI__builtin_ia32_vec_ext_v16qi:
case X86::BI__builtin_ia32_vec_ext_v8hi:
@@ -14373,6 +14374,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// Otherwise we could just do this in the header file.
return Builder.CreateExtractElement(Ops[0], Index);
}
+case X86::BI__builtin_ia32_vec_set_v4hi:
case X86::BI__builtin_ia32_vec_set_v16qi:
case X86::BI__builtin_ia32_vec_set_v8hi:
case X86::BI__builtin_ia32_vec_set_v4si:
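Adding the v4hi cases here routes the MMX extract/insert builtins through the same generic path as their SSE counterparts, so Clang emits ordinary extractelement/insertelement IR rather than anything MMX-specific. A C-level model of the semantics that path implements (a sketch; the index masking mirrors the wrap-around handling this function applies to the constant Index, which is an assumption here):

```c
typedef short v4hi __attribute__((vector_size(8)));

/* Model of vec_ext_v4hi: read one 16-bit lane. */
static inline short ext_v4hi_model(v4hi v, int i) {
  return v[i & 3]; /* assumed wrap-around masking of the index */
}

/* Model of vec_set_v4hi: write one 16-bit lane, return the vector. */
static inline v4hi set_v4hi_model(v4hi v, short s, int i) {
  v[i & 3] = s;
  return v;
}
```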
42 changes: 20 additions & 22 deletions clang/lib/Headers/emmintrin.h
@@ -49,12 +49,10 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, \
-                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX \
-  __attribute__((__always_inline__, __nodebug__, \
-                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2,no-evex512"), __min_vector_width__(128)))
Collaborator:
why lose the 80-column?

Member (Author):
No particular reason; I've reformatted the lines longer than 80 columns (these lines are now clang-format'd).

(Generally I don't worry about this sort of thing, because I just run clang-format on my change. However, this file's current text was so far from clang-format's preferred style that it added a whole lot of noise. So I had decided not to commit the formatted output. But then also didn't think about column widths.)


+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1)

/// Adds lower double-precision values in both operands and returns the
/// sum in the lower 64 bits of the result. The upper 64 bits of the result
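__trunc64 reinterprets its 128-bit argument as two 64-bit lanes and keeps lane 0; __anyext128 places its 64-bit argument in the low half of a 128-bit vector and leaves the upper lanes undefined via the -1 shuffle indices. A small check of the truncation half, written directly against the same Clang builtins (assumes a little-endian target):

```c
#include <stdio.h>

typedef long long __v2di __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef int __v2si __attribute__((__vector_size__(8)));

int main(void) {
  __v4si wide = {1, 2, 3, 4};
  /* As in __trunc64: view the 128 bits as two i64 lanes, keep lane 0,
     then reinterpret the surviving 64 bits, here as two i32 lanes. */
  __v2si low = (__v2si)__builtin_shufflevector((__v2di)wide, (__v2di){}, 0);
  printf("%d %d\n", low[0], low[1]); /* 1 2 on little-endian */
  return 0;
}
```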
@@ -1486,8 +1484,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
}

/// Converts the two double-precision floating-point elements of a
@@ -1505,8 +1503,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
-  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
+  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
}

/// Converts the two signed 32-bit integer elements of a 64-bit vector of
@@ -1520,8 +1518,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
/// \param __a
/// A 64-bit vector of [2 x i32].
/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
-  return __builtin_ia32_cvtpi2pd((__v2si)__a);
+static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) {
+  return (__m128d)__builtin_convertvector((__v2si)__a, __v2df);
}

/// Returns the low-order element of a 128-bit vector of [2 x double] as
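With these rewrites, _mm_cvtpd_pi32 goes through the 128-bit cvtpd2dq (which rounds according to MXCSR, round-to-nearest-even by default), _mm_cvttpd_pi32 through the truncating cvttpd2dq, and _mm_cvtpi32_pd becomes a plain element-wise __builtin_convertvector from int to double. A quick behavioral check (a sketch; assumes x86-64 with SSE2 and the default rounding mode):

```c
#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  __m128d d = _mm_set_pd(2.5, -1.5); /* lane 0 = -1.5, lane 1 = 2.5 */
  __m64 r = _mm_cvtpd_pi32(d);       /* round to nearest, ties to even */
  __m64 t = _mm_cvttpd_pi32(d);      /* truncate toward zero */
  int rv[2], tv[2];
  memcpy(rv, &r, sizeof rv);
  memcpy(tv, &t, sizeof tv);
  printf("cvt:  %d %d\n", rv[0], rv[1]); /* -2 2 */
  printf("cvtt: %d %d\n", tv[0], tv[1]); /* -1 2 */
  return 0;
}
```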
@@ -2108,9 +2106,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
/// \param __b
/// A 64-bit integer.
/// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
+  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
}

/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
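_mm_add_si64 adds its operands as whole 64-bit integers, so a scalar add is exact; the casts go through unsigned long long because unsigned wraparound is well defined in C and matches paddq, whereas signed overflow would be undefined behavior. A small check (sketch; uses the standard _mm_cvtsi64_m64/_mm_cvtm64_si64 conversions):

```c
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m64 a = _mm_cvtsi64_m64(0x7fffffffffffffffLL); /* INT64_MAX */
  __m64 b = _mm_cvtsi64_m64(1);
  __m64 s = _mm_add_si64(a, b);         /* wraps, just as paddq does */
  printf("%lld\n", _mm_cvtm64_si64(s)); /* -9223372036854775808 */
  return 0;
}
```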
@@ -2431,9 +2428,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
/// \param __b
/// A 64-bit integer containing one of the source operands.
/// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
-                                                            __m64 __b) {
-  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
+  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
+                                             (__v4si)__anyext128(__b)));
}

/// Multiplies 32-bit unsigned integer values contained in the lower
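_mm_mul_su32 multiplies the low unsigned 32-bit lanes of its operands into a full 64-bit product. The rewrite widens both operands with __anyext128, runs the 128-bit pmuludq, and truncates back; the undefined upper lanes introduced by __anyext128 are harmless because pmuludq only reads lanes 0 and 2, and only the low 64 bits of the result survive __trunc64. A quick check (sketch):

```c
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m64 a = _mm_cvtsi64_m64(0x00000005ffffffffLL); /* low lane 0xffffffff */
  __m64 b = _mm_cvtsi64_m64(0x0000000700000002LL); /* low lane 2 */
  __m64 p = _mm_mul_su32(a, b); /* 0xffffffff * 2 = 0x1fffffffe */
  printf("%llx\n", (unsigned long long)_mm_cvtm64_si64(p)); /* 1fffffffe */
  return 0;
}
```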
@@ -2539,9 +2536,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
/// A 64-bit integer vector containing the subtrahend.
/// \returns A 64-bit integer vector containing the difference of the values in
/// the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
-                                                            __m64 __b) {
-  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
+  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
}

/// Subtracts the corresponding elements of two [2 x i64] vectors.
@@ -4889,8 +4885,10 @@ void _mm_pause(void);
#if defined(__cplusplus)
} // extern "C"
#endif

+#undef __anyext128
+#undef __trunc64
#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
