From 68cb903594cd03dd708ef70c85c10807a6deefb5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Jul 2024 13:17:28 +0100 Subject: [PATCH 001/486] Revert d43ec97de081755990264049eba09cb7c83cb321 "[X86] combineConcatVectorOps - IsConcatFree - peek through bitcasts to find inplace subvectors." I've been given reports of this causing infinite loops downstream - I'm going to revert for now while I investigate. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 ++++++++--------- llvm/test/CodeGen/X86/vselect-avx.ll | 17 ++++++++++++++--- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 881e06e5f78b4..b07662b67e3e7 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56163,19 +56163,18 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, }; auto IsConcatFree = [](MVT VT, ArrayRef SubOps, unsigned Op) { bool AllConstants = true; - bool AllSubs = true; - unsigned VecSize = VT.getSizeInBits(); + bool AllSubVectors = true; for (unsigned I = 0, E = SubOps.size(); I != E; ++I) { - SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op)); - unsigned SubSize = BC.getValueSizeInBits(); - unsigned EltSize = BC.getScalarValueSizeInBits(); + SDValue Sub = SubOps[I].getOperand(Op); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); + SDValue BC = peekThroughBitcasts(Sub); AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode()); - AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR && - BC.getOperand(0).getValueSizeInBits() == VecSize && - (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize); + AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0).getValueType() == VT && + Sub.getConstantOperandAPInt(1) == (I * NumSubElts); } - return AllConstants || AllSubs; + return AllConstants || AllSubVectors; }; 
switch (Op0.getOpcode()) { diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll index 364390a4a60e5..bd26948766a56 100644 --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -259,7 +259,7 @@ define void @blendv_split(ptr %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32> %x, < ret void } -; Concatenate 128-bit pblendvb back together on AVX2+ targets (hidden by SSE __m128i bitcasts) +; TODO: Concatenate 128-bit pblendvb back together on AVX2+ targets (hidden by SSE __m128i bitcasts) define <4 x i64> @vselect_concat_split_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { ; AVX1-LABEL: vselect_concat_split_v16i8: ; AVX1: ## %bb.0: @@ -277,13 +277,24 @@ define <4 x i64> @vselect_concat_split_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i6 ; AVX2-LABEL: vselect_concat_split_v16i8: ; AVX2: ## %bb.0: ; AVX2-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm3 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: vselect_concat_split_v16i8: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vpternlogq $216, %ymm2, %ymm1, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512-NEXT: ## kill: def $xmm1 killed $xmm1 killed $ymm1 def $ymm1 +; AVX512-NEXT: vpternlogq $226, %xmm0, %xmm2, %xmm1 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vpternlogq $226, %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm0 ; AVX512-NEXT: retq %a.bc = bitcast <4 x i64> %a to <32 x i8> %b.bc = bitcast <4 x i64> %b to <32 x i8> From 8badfccefeaff2c05ef71a8d2fd6d803a1b4e129 Mon Sep 17 00:00:00 2001 From: jameshu15869 
<55058507+jameshu15869@users.noreply.github.com> Date: Thu, 18 Jul 2024 07:18:23 -0500 Subject: [PATCH 002/486] [libc] Add Multithreaded GPU Benchmarks (#98964) This PR runs benchmarks on a 32 threads (A single warp on NVPTX) by default, adding the option for single threaded benchmarks. We can specify that a benchmark should be run on a single thread using the `SINGLE_THREADED_BENCHMARK()` macro. I chose to use a flag here so that other options could be added in the future. --- libc/benchmarks/gpu/CMakeLists.txt | 1 + libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 9 +++++++-- libc/benchmarks/gpu/LibcGpuBenchmark.h | 19 ++++++++++++++++--- libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 2 ++ .../gpu/src/ctype/isalnum_benchmark.cpp | 4 ++++ 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 14ba9f3f64b48..29e27724e1ab3 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -10,6 +10,7 @@ function(add_benchmark benchmark_name) "LINK_LIBRARIES" # Multi-value arguments ${ARGN} ) + if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) message(FATAL_ERROR "target does not support clock") endif() diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 23fff3e8180f7..c926d8efd7db2 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -114,8 +114,13 @@ void Benchmark::run_benchmarks() { all_results.reset(); gpu::sync_threads(); - auto current_result = b->run(); - all_results.update(current_result); + if (!b->flags || + ((b->flags & BenchmarkFlags::SINGLE_THREADED) && id == 0) || + ((b->flags & BenchmarkFlags::SINGLE_WAVE) && + id < gpu::get_lane_size())) { + auto current_result = b->run(); + all_results.update(current_result); + } gpu::sync_threads(); if (id == 0) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h 
b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 1f813f8655de6..29d7ba8b0a132 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -74,16 +74,19 @@ struct BenchmarkResult { clock_t total_time = 0; }; +enum BenchmarkFlags { SINGLE_THREADED = 0x1, SINGLE_WAVE = 0x2 }; + BenchmarkResult benchmark(const BenchmarkOptions &options, cpp::function wrapper_func); class Benchmark { const cpp::function func; const cpp::string_view name; + const uint8_t flags; public: - Benchmark(cpp::function func, char const *name) - : func(func), name(name) { + Benchmark(cpp::function func, char const *name, uint8_t flags) + : func(func), name(name), flags(flags) { add_benchmark(this); } @@ -104,6 +107,16 @@ class Benchmark { #define BENCHMARK(SuiteName, TestName, Func) \ LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ - Func, #SuiteName "." #TestName) + Func, #SuiteName "." #TestName, 0) + +#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \ + LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ + Func, #SuiteName "." #TestName, \ + LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_THREADED) + +#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ + LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ + Func, #SuiteName "." 
#TestName, \ + LIBC_NAMESPACE::benchmarks::BenchmarkFlags::SINGLE_WAVE) #endif diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt index 79f01425770da..f277624dbb901 100644 --- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt @@ -8,6 +8,8 @@ add_benchmark( isalnum_benchmark.cpp DEPENDS libc.src.ctype.isalnum + LOADER_ARGS + --threads 64 ) add_benchmark( diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp index 6f8d247902f76..ffa5a99860bfc 100644 --- a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp +++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp @@ -7,6 +7,10 @@ uint64_t BM_IsAlnum() { return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x); } BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnum, BM_IsAlnum); +SINGLE_THREADED_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleThread, + BM_IsAlnum); +SINGLE_WAVE_BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumSingleWave, + BM_IsAlnum); uint64_t BM_IsAlnumCapital() { char x = 'A'; From bf02f41726a48e5eb1dbe7c188f9e36ec6a29ac2 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 18 Jul 2024 08:23:41 -0400 Subject: [PATCH 003/486] Fix a regression with alignas on structure members in C (#98642) This was a 19.x regression and thus has no release note. Fixes #95032 --- clang/lib/Sema/ParsedAttr.cpp | 2 +- clang/lib/Sema/SemaDeclAttr.cpp | 56 ++++++++++++++++++++------------- clang/test/C/C2y/n3254.c | 28 ++++++++--------- clang/test/Sema/alignas.c | 19 +++++++++-- 4 files changed, 66 insertions(+), 39 deletions(-) diff --git a/clang/lib/Sema/ParsedAttr.cpp b/clang/lib/Sema/ParsedAttr.cpp index 6abc90336c994..2109494aa5889 100644 --- a/clang/lib/Sema/ParsedAttr.cpp +++ b/clang/lib/Sema/ParsedAttr.cpp @@ -225,7 +225,7 @@ bool ParsedAttr::slidesFromDeclToDeclSpecLegacyBehavior() const { // atributes. 
return false; - assert(isStandardAttributeSyntax()); + assert(isStandardAttributeSyntax() || isAlignas()); // We have historically allowed some type attributes with standard attribute // syntax to slide to the decl-specifier-seq, so we have to keep supporting diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 20f46c003a464..41295bfb3b94f 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6435,8 +6435,14 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, return; // Ignore C++11 attributes on declarator chunks: they appertain to the type - // instead. - if (AL.isCXX11Attribute() && !Options.IncludeCXX11Attributes) + // instead. Note, isCXX11Attribute() will look at whether the attribute is + // [[]] or alignas, while isC23Attribute() will only look at [[]]. This is + // important for ensuring that alignas in C23 is properly handled on a + // structure member declaration because it is a type-specifier-qualifier in + // C but still applies to the declaration rather than the type. + if ((S.getLangOpts().CPlusPlus ? AL.isCXX11Attribute() + : AL.isC23Attribute()) && + !Options.IncludeCXX11Attributes) return; // Unknown attributes are automatically warned on. Target-specific attributes @@ -7500,29 +7506,37 @@ void Sema::ProcessDeclAttributes(Scope *S, Decl *D, const Declarator &PD) { // Ordering of attributes can be important, so we take care to process // attributes in the order in which they appeared in the source code. + auto ProcessAttributesWithSliding = + [&](const ParsedAttributesView &Src, + const ProcessDeclAttributeOptions &Options) { + ParsedAttributesView NonSlidingAttrs; + for (ParsedAttr &AL : Src) { + // FIXME: this sliding is specific to standard attributes and should + // eventually be deprecated and removed as those are not intended to + // slide to anything. 
+ if ((AL.isStandardAttributeSyntax() || AL.isAlignas()) && + AL.slidesFromDeclToDeclSpecLegacyBehavior()) { + // Skip processing the attribute, but do check if it appertains to + // the declaration. This is needed for the `MatrixType` attribute, + // which, despite being a type attribute, defines a `SubjectList` + // that only allows it to be used on typedef declarations. + AL.diagnoseAppertainsTo(*this, D); + } else { + NonSlidingAttrs.addAtEnd(&AL); + } + } + ProcessDeclAttributeList(S, D, NonSlidingAttrs, Options); + }; + // First, process attributes that appeared on the declaration itself (but // only if they don't have the legacy behavior of "sliding" to the DeclSepc). - ParsedAttributesView NonSlidingAttrs; - for (ParsedAttr &AL : PD.getDeclarationAttributes()) { - if (AL.slidesFromDeclToDeclSpecLegacyBehavior()) { - // Skip processing the attribute, but do check if it appertains to the - // declaration. This is needed for the `MatrixType` attribute, which, - // despite being a type attribute, defines a `SubjectList` that only - // allows it to be used on typedef declarations. - AL.diagnoseAppertainsTo(*this, D); - } else { - NonSlidingAttrs.addAtEnd(&AL); - } - } - ProcessDeclAttributeList(S, D, NonSlidingAttrs); + ProcessAttributesWithSliding(PD.getDeclarationAttributes(), {}); // Apply decl attributes from the DeclSpec if present. - if (!PD.getDeclSpec().getAttributes().empty()) { - ProcessDeclAttributeList(S, D, PD.getDeclSpec().getAttributes(), - ProcessDeclAttributeOptions() - .WithIncludeCXX11Attributes(false) - .WithIgnoreTypeAttributes(true)); - } + ProcessAttributesWithSliding(PD.getDeclSpec().getAttributes(), + ProcessDeclAttributeOptions() + .WithIncludeCXX11Attributes(false) + .WithIgnoreTypeAttributes(true)); // Walk the declarator structure, applying decl attributes that were in a type // position to the decl itself. 
This handles cases like: diff --git a/clang/test/C/C2y/n3254.c b/clang/test/C/C2y/n3254.c index e08659cf377aa..60d068cf9980b 100644 --- a/clang/test/C/C2y/n3254.c +++ b/clang/test/C/C2y/n3254.c @@ -21,9 +21,9 @@ struct S { // CHECK-LABEL: define dso_local i32 @foo( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 1 +// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 4 // CHECK-NEXT: [[S_PTR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[BUFFER]], i8 0, i64 12, i1 false) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[BUFFER]], i8 0, i64 12, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[S_PTR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_PTR]], align 8 @@ -40,13 +40,13 @@ int foo() { // CHECK-LABEL: define dso_local signext i8 @bar( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 1 +// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 4 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i32 0, i32 1 -// CHECK-NEXT: store i8 97, ptr [[C]], align 1 +// CHECK-NEXT: store i8 97, ptr [[C]], align 4 // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDECAY1]], i32 0, i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[C2]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[C2]], align 4 // CHECK-NEXT: ret i8 [[TMP0]] // char bar() { @@ -58,13 +58,13 @@ char bar() { // CHECK-LABEL: define dso_local float @baz( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: 
[[BUFFER:%.*]] = alloca [12 x i8], align 1 +// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 4 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY]], i32 0, i32 2 -// CHECK-NEXT: store float 3.000000e+00, ptr [[F]], align 1 +// CHECK-NEXT: store float 3.000000e+00, ptr [[F]], align 4 // CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: [[F2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAYDECAY1]], i32 0, i32 2 -// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F2]], align 1 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[F2]], align 4 // CHECK-NEXT: ret float [[TMP0]] // float baz() { @@ -80,9 +80,9 @@ struct T { // CHECK-LABEL: define dso_local signext i8 @quux( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_T:%.*]], align 1 +// CHECK-NEXT: [[T:%.*]] = alloca [[STRUCT_T:%.*]], align 4 // CHECK-NEXT: [[S_PTR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[T]], i8 0, i64 12, i1 false) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[T]], i8 0, i64 12, i1 false) // CHECK-NEXT: [[BUFFER:%.*]] = getelementptr inbounds [[STRUCT_T]], ptr [[T]], i32 0, i32 0 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[S_PTR]], align 8 @@ -100,10 +100,10 @@ char quux() { // CHECK-LABEL: define dso_local float @quibble( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 1 +// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 4 // CHECK-NEXT: [[T_PTR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[S_PTR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[BUFFER]], i8 0, i64 12, i1 
false) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[BUFFER]], i8 0, i64 12, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: store ptr [[ARRAYDECAY]], ptr [[T_PTR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_PTR]], align 8 @@ -125,13 +125,13 @@ float quibble() { // CHECK-LABEL: define dso_local i32 @quorble( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 1 +// CHECK-NEXT: [[BUFFER:%.*]] = alloca [12 x i8], align 4 // CHECK-NEXT: [[S_PTR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: [[BUFFER1:%.*]] = getelementptr inbounds [[STRUCT_T:%.*]], ptr [[ARRAYDECAY]], i32 0, i32 0 // CHECK-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER1]], i64 0, i64 0 // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[ARRAYDECAY2]], i32 0, i32 0 -// CHECK-NEXT: store i32 12, ptr [[X]], align 1 +// CHECK-NEXT: store i32 12, ptr [[X]], align 4 // CHECK-NEXT: [[ARRAYDECAY3:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER]], i64 0, i64 0 // CHECK-NEXT: [[BUFFER4:%.*]] = getelementptr inbounds [[STRUCT_T]], ptr [[ARRAYDECAY3]], i32 0, i32 0 // CHECK-NEXT: [[ARRAYDECAY5:%.*]] = getelementptr inbounds [12 x i8], ptr [[BUFFER4]], i64 0, i64 0 diff --git a/clang/test/Sema/alignas.c b/clang/test/Sema/alignas.c index 020eff6a141c0..391553bc540ec 100644 --- a/clang/test/Sema/alignas.c +++ b/clang/test/Sema/alignas.c @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -Dalignof=__alignof %s // RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -Dalignof=_Alignof -DUSING_C11_SYNTAX %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 -DUSING_C11_SYNTAX %s _Alignas(3) int align_illegal; //expected-error {{requested alignment is not a power of 2}} _Alignas(int) char 
align_big; @@ -18,12 +19,24 @@ void f(_Alignas(1) char c) { // expected-error {{'_Alignas' attribute cannot be } #ifdef USING_C11_SYNTAX -// expected-warning@+4{{'_Alignof' applied to an expression is a GNU extension}} -// expected-warning@+4{{'_Alignof' applied to an expression is a GNU extension}} -// expected-warning@+4{{'_Alignof' applied to an expression is a GNU extension}} +// expected-warning-re@+4{{'{{(_A|a)}}lignof' applied to an expression is a GNU extension}} +// expected-warning-re@+4{{'{{(_A|a)}}lignof' applied to an expression is a GNU extension}} +// expected-warning-re@+4{{'{{(_A|a)}}lignof' applied to an expression is a GNU extension}} #endif _Static_assert(alignof(align_big) == alignof(int), "k's alignment is wrong"); _Static_assert(alignof(align_small) == 1, "j's alignment is wrong"); _Static_assert(alignof(align_multiple) == 8, "l's alignment is wrong"); _Static_assert(alignof(struct align_member) == 8, "quuux's alignment is wrong"); _Static_assert(sizeof(struct align_member) == 8, "quuux's size is wrong"); + +struct GH95032_1 { + _Alignas(16) char bytes[16]; +}; +_Static_assert(_Alignof(struct GH95032_1) == 16, ""); + +#if __STDC_VERSION__ >= 202311L +struct GH95032_2 { + alignas(16) char bytes[16]; +}; +static_assert(alignof(struct GH95032_2) == 16); +#endif From c5f402f95d9617882b26d5799f503383b895c2e4 Mon Sep 17 00:00:00 2001 From: Egor Zhdan Date: Thu, 18 Jul 2024 13:27:24 +0100 Subject: [PATCH 004/486] [APINotes] Reduce memory footprint for Obj-C/C++ contexts We were storing extraneous data for certain Objective-C/C++ entities. Specifically, for declarations that can be nested in another context (such as functions) we were storing the kind of the parent context in addition to its ID. The ID is always sufficient. This removes the logically incorrect usages of `ContextTableKey` that don't actually describe a context, but rather describe a single declaration. 
This introduces `SingleDeclTableKey` to store that kind of entities in a more compact and reasonable way. --- clang/lib/APINotes/APINotesFormat.h | 57 +++++++++++++++++++++++---- clang/lib/APINotes/APINotesReader.cpp | 32 ++++++--------- clang/lib/APINotes/APINotesWriter.cpp | 46 +++++++++++---------- 3 files changed, 84 insertions(+), 51 deletions(-) diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h index e3aa76df8316c..42dfe7a773a97 100644 --- a/clang/lib/APINotes/APINotesFormat.h +++ b/clang/lib/APINotes/APINotesFormat.h @@ -24,7 +24,7 @@ const uint16_t VERSION_MAJOR = 0; /// API notes file minor version number. /// /// When the format changes IN ANY WAY, this number should be incremented. -const uint16_t VERSION_MINOR = 26; // SwiftCopyable +const uint16_t VERSION_MINOR = 27; // SingleDeclTableKey const uint8_t kSwiftCopyable = 1; const uint8_t kSwiftNonCopyable = 2; @@ -269,12 +269,6 @@ struct ContextTableKey { : parentContextID(parentContextID), contextKind(contextKind), contextID(contextID) {} - ContextTableKey(std::optional context, IdentifierID nameID) - : parentContextID(context ? context->id.Value : (uint32_t)-1), - contextKind(context ? static_cast(context->kind) - : static_cast(-1)), - contextID(nameID) {} - llvm::hash_code hashValue() const { return llvm::hash_value( std::tuple{parentContextID, contextKind, contextID}); @@ -286,6 +280,32 @@ inline bool operator==(const ContextTableKey &lhs, const ContextTableKey &rhs) { lhs.contextKind == rhs.contextKind && lhs.contextID == rhs.contextID; } +/// A stored Objective-C or C++ declaration, represented by the ID of its parent +/// context, and the name of the declaration. 
+struct SingleDeclTableKey { + uint32_t parentContextID; + uint32_t nameID; + + SingleDeclTableKey() : parentContextID(-1), nameID(-1) {} + + SingleDeclTableKey(uint32_t ParentContextID, uint32_t NameID) + : parentContextID(ParentContextID), nameID(NameID) {} + + SingleDeclTableKey(std::optional ParentCtx, IdentifierID NameID) + : parentContextID(ParentCtx ? ParentCtx->id.Value + : static_cast(-1)), + nameID(NameID) {} + + llvm::hash_code hashValue() const { + return llvm::hash_value(std::make_pair(parentContextID, nameID)); + } +}; + +inline bool operator==(const SingleDeclTableKey &lhs, + const SingleDeclTableKey &rhs) { + return lhs.parentContextID == rhs.parentContextID && lhs.nameID == rhs.nameID; +} + } // namespace api_notes } // namespace clang @@ -341,6 +361,29 @@ template <> struct DenseMapInfo { return lhs == rhs; } }; + +template <> struct DenseMapInfo { + static inline clang::api_notes::SingleDeclTableKey getEmptyKey() { + return clang::api_notes::SingleDeclTableKey(); + } + + static inline clang::api_notes::SingleDeclTableKey getTombstoneKey() { + return clang::api_notes::SingleDeclTableKey{ + DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey()}; + } + + static unsigned + getHashValue(const clang::api_notes::SingleDeclTableKey &value) { + return value.hashValue(); + } + + static bool isEqual(const clang::api_notes::SingleDeclTableKey &lhs, + const clang::api_notes::SingleDeclTableKey &rhs) { + return lhs == rhs; + } +}; + } // namespace llvm #endif diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp index 8454e092b55ac..7600738374840 100644 --- a/clang/lib/APINotes/APINotesReader.cpp +++ b/clang/lib/APINotes/APINotesReader.cpp @@ -429,15 +429,13 @@ class ObjCSelectorTableInfo { /// Used to deserialize the on-disk global variable table. 
class GlobalVariableTableInfo - : public VersionedTableInfo { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { auto CtxID = endian::readNext(Data); - auto ContextKind = - endian::readNext(Data); auto NameID = endian::readNext(Data); - return {CtxID, ContextKind, NameID}; + return {CtxID, NameID}; } hash_value_type ComputeHash(internal_key_type Key) { @@ -454,15 +452,13 @@ class GlobalVariableTableInfo /// Used to deserialize the on-disk global function table. class GlobalFunctionTableInfo - : public VersionedTableInfo { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { auto CtxID = endian::readNext(Data); - auto ContextKind = - endian::readNext(Data); auto NameID = endian::readNext(Data); - return {CtxID, ContextKind, NameID}; + return {CtxID, NameID}; } hash_value_type ComputeHash(internal_key_type Key) { @@ -501,15 +497,13 @@ class EnumConstantTableInfo /// Used to deserialize the on-disk tag table. class TagTableInfo - : public VersionedTableInfo { + : public VersionedTableInfo { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { auto CtxID = endian::readNext(Data); - auto ContextKind = - endian::readNext(Data); auto NameID = endian::readNext(Data); - return {CtxID, ContextKind, NameID}; + return {CtxID, NameID}; } hash_value_type ComputeHash(internal_key_type Key) { @@ -563,16 +557,14 @@ class TagTableInfo /// Used to deserialize the on-disk typedef table. 
class TypedefTableInfo - : public VersionedTableInfo { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { auto CtxID = endian::readNext(Data); - auto ContextKind = - endian::readNext(Data); auto nameID = endian::readNext(Data); - return {CtxID, ContextKind, nameID}; + return {CtxID, nameID}; } hash_value_type ComputeHash(internal_key_type Key) { @@ -1929,7 +1921,7 @@ auto APINotesReader::lookupGlobalVariable(llvm::StringRef Name, if (!NameID) return std::nullopt; - ContextTableKey Key(Ctx, *NameID); + SingleDeclTableKey Key(Ctx, *NameID); auto Known = Implementation->GlobalVariableTable->find(Key); if (Known == Implementation->GlobalVariableTable->end()) @@ -1948,7 +1940,7 @@ auto APINotesReader::lookupGlobalFunction(llvm::StringRef Name, if (!NameID) return std::nullopt; - ContextTableKey Key(Ctx, *NameID); + SingleDeclTableKey Key(Ctx, *NameID); auto Known = Implementation->GlobalFunctionTable->find(Key); if (Known == Implementation->GlobalFunctionTable->end()) @@ -1982,7 +1974,7 @@ auto APINotesReader::lookupTag(llvm::StringRef Name, std::optional Ctx) if (!NameID) return std::nullopt; - ContextTableKey Key(Ctx, *NameID); + SingleDeclTableKey Key(Ctx, *NameID); auto Known = Implementation->TagTable->find(Key); if (Known == Implementation->TagTable->end()) @@ -2001,7 +1993,7 @@ auto APINotesReader::lookupTypedef(llvm::StringRef Name, if (!NameID) return std::nullopt; - ContextTableKey Key(Ctx, *NameID); + SingleDeclTableKey Key(Ctx, *NameID); auto Known = Implementation->TypedefTable->find(Key); if (Known == Implementation->TypedefTable->end()) diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp index 4053d515ef426..1090d3f20df21 100644 --- a/clang/lib/APINotes/APINotesWriter.cpp +++ b/clang/lib/APINotes/APINotesWriter.cpp @@ -75,17 +75,17 @@ class APINotesWriter::Implementation { /// Information about global variables. /// - /// Indexed by the context ID, contextKind, identifier ID. 
+ /// Indexed by the context ID, identifier ID. llvm::DenseMap< - ContextTableKey, + SingleDeclTableKey, llvm::SmallVector, 1>> GlobalVariables; /// Information about global functions. /// - /// Indexed by the context ID, contextKind, identifier ID. + /// Indexed by the context ID, identifier ID. llvm::DenseMap< - ContextTableKey, + SingleDeclTableKey, llvm::SmallVector, 1>> GlobalFunctions; @@ -98,15 +98,15 @@ class APINotesWriter::Implementation { /// Information about tags. /// - /// Indexed by the context ID, contextKind, identifier ID. - llvm::DenseMap, 1>> Tags; /// Information about typedefs. /// - /// Indexed by the context ID, contextKind, identifier ID. - llvm::DenseMap, 1>> Typedefs; @@ -865,18 +865,17 @@ void APINotesWriter::Implementation::writeObjCSelectorBlock( namespace { /// Used to serialize the on-disk global variable table. class GlobalVariableTableInfo - : public VersionedTableInfo { public: unsigned getKeyLength(key_type_ref) { - return sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint32_t); + return sizeof(uint32_t) + sizeof(uint32_t); } void EmitKey(raw_ostream &OS, key_type_ref Key, unsigned) { llvm::support::endian::Writer writer(OS, llvm::endianness::little); writer.write(Key.parentContextID); - writer.write(Key.contextKind); - writer.write(Key.contextID); + writer.write(Key.nameID); } hash_value_type ComputeHash(key_type_ref Key) { @@ -979,18 +978,17 @@ void emitFunctionInfo(raw_ostream &OS, const FunctionInfo &FI) { /// Used to serialize the on-disk global function table. 
class GlobalFunctionTableInfo - : public VersionedTableInfo { public: unsigned getKeyLength(key_type_ref) { - return sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint32_t); + return sizeof(uint32_t) + sizeof(uint32_t); } void EmitKey(raw_ostream &OS, key_type_ref Key, unsigned) { llvm::support::endian::Writer writer(OS, llvm::endianness::little); writer.write(Key.parentContextID); - writer.write(Key.contextKind); - writer.write(Key.contextID); + writer.write(Key.nameID); } hash_value_type ComputeHash(key_type_ref Key) { @@ -1091,20 +1089,20 @@ void APINotesWriter::Implementation::writeEnumConstantBlock( namespace { template class CommonTypeTableInfo - : public VersionedTableInfo { + : public VersionedTableInfo { public: using key_type_ref = typename CommonTypeTableInfo::key_type_ref; using hash_value_type = typename CommonTypeTableInfo::hash_value_type; unsigned getKeyLength(key_type_ref) { - return sizeof(uint32_t) + sizeof(uint8_t) + sizeof(IdentifierID); + return sizeof(uint32_t) + sizeof(IdentifierID); } void EmitKey(raw_ostream &OS, key_type_ref Key, unsigned) { llvm::support::endian::Writer writer(OS, llvm::endianness::little); writer.write(Key.parentContextID); - writer.write(Key.contextKind); - writer.write(Key.contextID); + writer.write(Key.nameID); } hash_value_type ComputeHash(key_type_ref Key) { @@ -1351,7 +1349,7 @@ void APINotesWriter::addGlobalVariable(std::optional Ctx, const GlobalVariableInfo &Info, VersionTuple SwiftVersion) { IdentifierID VariableID = Implementation->getIdentifier(Name); - ContextTableKey Key(Ctx, VariableID); + SingleDeclTableKey Key(Ctx, VariableID); Implementation->GlobalVariables[Key].push_back({SwiftVersion, Info}); } @@ -1360,7 +1358,7 @@ void APINotesWriter::addGlobalFunction(std::optional Ctx, const GlobalFunctionInfo &Info, VersionTuple SwiftVersion) { IdentifierID NameID = Implementation->getIdentifier(Name); - ContextTableKey Key(Ctx, NameID); + SingleDeclTableKey Key(Ctx, NameID); 
Implementation->GlobalFunctions[Key].push_back({SwiftVersion, Info}); } @@ -1374,7 +1372,7 @@ void APINotesWriter::addEnumConstant(llvm::StringRef Name, void APINotesWriter::addTag(std::optional Ctx, llvm::StringRef Name, const TagInfo &Info, VersionTuple SwiftVersion) { IdentifierID TagID = Implementation->getIdentifier(Name); - ContextTableKey Key(Ctx, TagID); + SingleDeclTableKey Key(Ctx, TagID); Implementation->Tags[Key].push_back({SwiftVersion, Info}); } @@ -1382,7 +1380,7 @@ void APINotesWriter::addTypedef(std::optional Ctx, llvm::StringRef Name, const TypedefInfo &Info, VersionTuple SwiftVersion) { IdentifierID TypedefID = Implementation->getIdentifier(Name); - ContextTableKey Key(Ctx, TypedefID); + SingleDeclTableKey Key(Ctx, TypedefID); Implementation->Typedefs[Key].push_back({SwiftVersion, Info}); } } // namespace api_notes From 39bb244a16e59cb8f2080f96e9de599007762635 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Thu, 18 Jul 2024 20:49:53 +0800 Subject: [PATCH 005/486] [SLP][REVEC] Make Instruction::Call support vector instructions. 
(#99317) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- llvm/test/Transforms/SLPVectorizer/revec.ll | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b994645cece61..d8c3bae06e932 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13499,7 +13499,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { } ScalarArg = CEI->getArgOperand(I); if (cast(OpVec->getType())->getElementType() != - ScalarArg->getType() && + ScalarArg->getType()->getScalarType() && It == MinBWs.end()) { auto *CastTy = getWidenedType(ScalarArg->getType(), VecTy->getNumElements()); diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index 4b37b100763a9..c2dc6d0ab73b7 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -38,3 +38,23 @@ entry: store <4 x i32> %add.i65, ptr %arrayidx42, align 4 ret void } + +define void @test2(ptr %in, ptr %out) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i16>, ptr [[IN:%.*]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP0]], <16 x i16> [[TMP0]]) +; CHECK-NEXT: store <16 x i16> [[TMP1]], ptr [[OUT:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + %0 = getelementptr i16, ptr %in, i64 8 + %1 = load <8 x i16>, ptr %in, align 2 + %2 = load <8 x i16>, ptr %0, align 2 + %3 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %1, <8 x i16> %1) + %4 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %2, <8 x i16> %2) + %5 = getelementptr i16, ptr %out, i64 8 + store <8 x i16> %3, ptr %out, align 2 + store <8 x i16> %4, ptr %5, align 2 + ret void +} From 27ec379f636ceac655faa290e78735ea98e02cbf Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 18 Jul 2024 
13:53:22 +0100 Subject: [PATCH 006/486] [AMDGPU] Do not select llvm.amdgcn.inverse.ballot with wrong wave size (#99470) This produces a "cannot select" error, instead of failing later with an illegal vgpr to sgpr copy. --- llvm/lib/Target/AMDGPU/SIInstructions.td | 2 ++ .../CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll | 10 ++++++++-- .../CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll | 10 ++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index f2721fbd164bf..2e617e5646c59 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -212,11 +212,13 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { } let usesCustomInserter = 1 in { +let WaveSizePredicate = isWave32 in def S_INVERSE_BALLOT_U32 : SPseudoInstSI< (outs SReg_32:$sdst), (ins SSrc_b32:$mask), [(set i1:$sdst, (int_amdgcn_inverse_ballot i32:$mask))] >; +let WaveSizePredicate = isWave64 in def S_INVERSE_BALLOT_U64 : SPseudoInstSI< (outs SReg_64:$sdst), (ins SSrc_b64:$mask), [(set i1:$sdst, (int_amdgcn_inverse_ballot i64:$mask))] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll index 71ed71cd84bcd..3781faa54e7dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG %s 
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG %s + +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=1 < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -global-isel=0 < %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s + +; GISEL-ERR: LLVM ERROR: cannot select: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.inverse.ballot) +; SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.inverse.ballot declare i1 @llvm.amdgcn.inverse.ballot(i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index 2e3dc11feed1e..29218a3625216 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=SDAG %s + +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -global-isel=1 < %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s +; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -global-isel=0 < %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s + +; GISEL-ERR: LLVM ERROR: cannot select: {{.*}} = G_INTRINSIC intrinsic(@llvm.amdgcn.inverse.ballot) +; SDAG-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.inverse.ballot declare i1 @llvm.amdgcn.inverse.ballot.i64(i64) From 7f2bd53b142ec147da9f8bd98775be1d14457ba1 Mon Sep 17 00:00:00 2001 From: Robin Caloudis Date: Thu, 18 Jul 2024 15:01:09 +0200 Subject: [PATCH 007/486] [libc++] Fix acceptance of convertible-to-{float,double,long double} in std::isfinite() (#98841) Closes https://github.com/llvm/llvm-project/issues/98816. --- libcxx/include/__math/traits.h | 12 ++++++++++++ libcxx/test/std/numerics/c.math/isfinite.pass.cpp | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/libcxx/include/__math/traits.h b/libcxx/include/__math/traits.h index a448266797557..27ec52ecef022 100644 --- a/libcxx/include/__math/traits.h +++ b/libcxx/include/__math/traits.h @@ -55,6 +55,18 @@ _LIBCPP_NODISCARD _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfin return true; } +_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(float __x) _NOEXCEPT { + return __builtin_isfinite(__x); +} + +_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(double __x) _NOEXCEPT { + return __builtin_isfinite(__x); +} + +_LIBCPP_NODISCARD inline _LIBCPP_CONSTEXPR_SINCE_CXX23 _LIBCPP_HIDE_FROM_ABI bool isfinite(long double __x) _NOEXCEPT { + return __builtin_isfinite(__x); +} + // isinf template ::value && numeric_limits<_A1>::has_infinity, int> = 0> diff --git a/libcxx/test/std/numerics/c.math/isfinite.pass.cpp b/libcxx/test/std/numerics/c.math/isfinite.pass.cpp index 
6bbc3aaac6d13..3d5be61634334 100644 --- a/libcxx/test/std/numerics/c.math/isfinite.pass.cpp +++ b/libcxx/test/std/numerics/c.math/isfinite.pass.cpp @@ -62,9 +62,21 @@ struct TestInt { } }; +template +struct ConvertibleTo { + operator T() const { return T(); } +}; + int main(int, char**) { types::for_each(types::floating_point_types(), TestFloat()); types::for_each(types::integral_types(), TestInt()); + // Make sure we can call `std::isfinite` with convertible types + { + assert(std::isfinite(ConvertibleTo())); + assert(std::isfinite(ConvertibleTo())); + assert(std::isfinite(ConvertibleTo())); + } + return 0; } From 6a141610f1fc3a53b5b1fd86fa996a90f5c1b849 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Jul 2024 14:02:13 +0100 Subject: [PATCH 008/486] [X86] Add getGFNICtrlMask helper for the constant creation and bitcasting. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b07662b67e3e7..56d08e7f76908 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29096,6 +29096,16 @@ uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) { llvm_unreachable("Unsupported GFNI opcode"); } +// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate. 
+SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, + unsigned Amt = 0) { + assert(VT.getVectorElementType() == MVT::i8 && + (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type"); + uint64_t Imm = getGFNICtrlImm(Opcode, Amt); + MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + return DAG.getBitcast(VT, DAG.getConstant(Imm, DL, MaskVT)); +} + // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, @@ -29284,9 +29294,7 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, return SDValue(); if (Subtarget.hasGFNI()) { - uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt); - MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8); - SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT)); + SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt); return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask, DAG.getTargetConstant(0, dl, MVT::i8)); } @@ -30191,9 +30199,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); - uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt); - MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT)); + SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt); return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask, DAG.getTargetConstant(0, DL, MVT::i8)); } @@ -31528,10 +31534,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits. 
if (Subtarget.hasGFNI()) { - MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8); - SDValue Matrix = - DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT); - Matrix = DAG.getBitcast(VT, Matrix); + SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT); return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix, DAG.getTargetConstant(0, DL, MVT::i8)); } From fde51e24b593087a15494203ef68a77bb4f12115 Mon Sep 17 00:00:00 2001 From: Janet Cobb Date: Thu, 18 Jul 2024 09:04:15 -0400 Subject: [PATCH 009/486] [libc++][test] Raise a useful error when no -std=c++NN flag is found to work (#99423) Recently ran into an issue with symptoms very similar to https://github.com/llvm/llvm-project/issues/56816 while attempting to build and test libc++ on NixOS. The error message is cryptic (just `StopIteration`), which was very annoying to track down. The error at least saying "hey your compiler's bad" would have saved me quite a bit of time figuring out the issue. --- libcxx/utils/libcxx/test/params.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index 3aadc54cf92dc..13c7297fd7304 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -88,6 +88,17 @@ def getStdFlag(cfg, std): return None +def getDefaultStdValue(cfg): + viable = [s for s in reversed(_allStandards) if getStdFlag(cfg, s)] + + if not viable: + raise RuntimeError( + "Unable to successfully detect the presence of any -std=c++NN flag. This likely indicates an issue with your compiler." 
+ ) + + return viable[0] + + def getSpeedOptimizationFlag(cfg): if _isClang(cfg) or _isAppleClang(cfg) or _isGCC(cfg): return "-O3" @@ -170,9 +181,7 @@ def getSuitableClangTidy(cfg): choices=_allStandards, type=str, help="The version of the standard to compile the test suite with.", - default=lambda cfg: next( - s for s in reversed(_allStandards) if getStdFlag(cfg, s) - ), + default=lambda cfg: getDefaultStdValue(cfg), actions=lambda std: [ AddFeature(std), AddSubstitution("%{cxx_std}", re.sub(r"\+", "x", std)), From 3eb666e292baf87c969be733de858b0cb7ead13f Mon Sep 17 00:00:00 2001 From: Krasimir Georgiev Date: Thu, 18 Jul 2024 13:04:59 +0000 Subject: [PATCH 010/486] update bazel for a6d2da8b9d7be19816dd4c76b02016c19618c1be --- .../llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index a6f9d4f2fdac2..4575c40bc76c5 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -104,16 +104,19 @@ libc_support_library( name = "qsort_test_helper", hdrs = ["SortingTest.h"], deps = [ + "//libc:__support_macros_config", "//libc:qsort_util", "//libc/test/UnitTest:LibcUnitTest", ], ) + libc_test( name = "qsort_test", srcs = ["qsort_test.cpp"], libc_function_deps = ["//libc:qsort"], deps = [":qsort_test_helper"], ) + libc_test( name = "quick_sort_test", srcs = ["quick_sort_test.cpp"], @@ -122,6 +125,7 @@ libc_test( "//libc:qsort_util", ], ) + libc_test( name = "heap_sort_test", srcs = ["heap_sort_test.cpp"], From fc65a9603bf16ed1fe98fbee6933bca9e2083384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 18 Jul 2024 07:11:00 +0200 Subject: [PATCH 011/486] [clang][Interp] Run record destructors when deallocating dynamic memory --- clang/lib/AST/Interp/Interp.cpp | 75 
++++++++++++++++++++++++++++ clang/lib/AST/Interp/Interp.h | 6 ++- clang/test/AST/Interp/new-delete.cpp | 73 +++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index fb63228f8aea8..2be9b5360d055 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -819,6 +819,81 @@ bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, return true; } +// FIXME: This is similar to code we already have in Compiler.cpp. +// I think it makes sense to instead add the field and base destruction stuff +// to the destructor Function itself. Then destroying a record would really +// _just_ be calling its destructor. That would also help with the diagnostic +// difference when the destructor or a field/base fails. +static bool runRecordDestructor(InterpState &S, CodePtr OpPC, + const Pointer &BasePtr, + const Descriptor *Desc) { + assert(Desc->isRecord()); + const Record *R = Desc->ElemRecord; + assert(R); + + // Fields. + for (const Record::Field &Field : llvm::reverse(R->fields())) { + const Descriptor *D = Field.Desc; + if (D->isRecord()) { + if (!runRecordDestructor(S, OpPC, BasePtr.atField(Field.Offset), D)) + return false; + } else if (D->isCompositeArray()) { + const Descriptor *ElemDesc = Desc->ElemDesc; + assert(ElemDesc->isRecord()); + for (unsigned I = 0; I != Desc->getNumElems(); ++I) { + if (!runRecordDestructor(S, OpPC, BasePtr.atIndex(I).narrow(), + ElemDesc)) + return false; + } + } + } + + // Destructor of this record. + if (const CXXDestructorDecl *Dtor = R->getDestructor(); + Dtor && !Dtor->isTrivial()) { + const Function *DtorFunc = S.getContext().getOrCreateFunction(Dtor); + if (!DtorFunc) + return false; + + S.Stk.push(BasePtr); + if (!Call(S, OpPC, DtorFunc, 0)) + return false; + } + + // Bases. 
+ for (const Record::Base &Base : llvm::reverse(R->bases())) { + if (!runRecordDestructor(S, OpPC, BasePtr.atField(Base.Offset), Base.Desc)) + return false; + } + + return true; +} + +bool RunDestructors(InterpState &S, CodePtr OpPC, const Block *B) { + assert(B); + const Descriptor *Desc = B->getDescriptor(); + + if (Desc->isPrimitive() || Desc->isPrimitiveArray()) + return true; + + assert(Desc->isRecord() || Desc->isCompositeArray()); + + if (Desc->isCompositeArray()) { + const Descriptor *ElemDesc = Desc->ElemDesc; + assert(ElemDesc->isRecord()); + + Pointer RP(const_cast(B)); + for (unsigned I = 0; I != Desc->getNumElems(); ++I) { + if (!runRecordDestructor(S, OpPC, RP.atIndex(I).narrow(), ElemDesc)) + return false; + } + return true; + } + + assert(Desc->isRecord()); + return runRecordDestructor(S, OpPC, Pointer(const_cast(B)), Desc); +} + bool Interpret(InterpState &S, APValue &Result) { // The current stack frame when we started Interpret(). // This is being used by the ops to determine wheter diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index b4f8c03280c85..17b3157cb40a9 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2872,8 +2872,8 @@ inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc, return true; } +bool RunDestructors(InterpState &S, CodePtr OpPC, const Block *B); static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) { - if (!CheckDynamicMemoryAllocation(S, OpPC)) return false; @@ -2904,6 +2904,10 @@ static inline bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm) { assert(Source); assert(BlockToDelete); + // Invoke destructors before deallocating the memory. 
+ if (!RunDestructors(S, OpPC, BlockToDelete)) + return false; + DynamicAllocator &Allocator = S.getAllocator(); bool WasArrayAlloc = Allocator.isArrayAllocation(Source); const Descriptor *BlockDesc = BlockToDelete->getDescriptor(); diff --git a/clang/test/AST/Interp/new-delete.cpp b/clang/test/AST/Interp/new-delete.cpp index 04ce3ae5f6637..cb46426c0e3be 100644 --- a/clang/test/AST/Interp/new-delete.cpp +++ b/clang/test/AST/Interp/new-delete.cpp @@ -476,7 +476,80 @@ constexpr Sp ss[] = {Sp{new int{154}}}; // both-error {{must be initialized by a // both-note {{pointer to heap-allocated object}} \ // both-note {{allocation performed here}} +namespace DeleteRunsDtors { + struct InnerFoo { + int *mem; + constexpr ~InnerFoo() { + delete mem; + } + }; + + struct Foo { + int *a; + InnerFoo IF; + + constexpr Foo() { + a = new int(13); + IF.mem = new int(100); + } + constexpr ~Foo() { delete a; } + }; + + constexpr int abc() { + Foo *F = new Foo(); + int n = *F->a; + delete F; + + return n; + } + static_assert(abc() == 13); + + constexpr int abc2() { + Foo *f = new Foo[3]; + + delete[] f; + + return 1; + } + static_assert(abc2() == 1); +} + +/// FIXME: There is a slight difference in diagnostics here, because we don't +/// create a new frame when we delete record fields or bases at all. 
+namespace FaultyDtorCalledByDelete { + struct InnerFoo { + int *mem; + constexpr ~InnerFoo() { + if (mem) { + (void)(1/0); // both-warning {{division by zero is undefined}} \ + // both-note {{division by zero}} + } + delete mem; + } + }; + + struct Foo { + int *a; + InnerFoo IF; + constexpr Foo() { + a = new int(13); + IF.mem = new int(100); + } + constexpr ~Foo() { delete a; } + }; + + constexpr int abc() { + Foo *F = new Foo(); + int n = *F->a; + delete F; // both-note {{in call to}} \ + // ref-note {{in call to}} + + return n; + } + static_assert(abc() == 13); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'abc()'}} +} #else From 4dfa75c663e53be1d548b340e562dd5c4e87fe65 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 18 Jul 2024 15:17:22 +0200 Subject: [PATCH 012/486] [libc++] Merge is_scoped_enum.h into is_enum.h (#99458) --- libcxx/include/CMakeLists.txt | 1 - libcxx/include/__type_traits/is_enum.h | 10 ++++++ libcxx/include/__type_traits/is_scoped_enum.h | 33 ------------------- libcxx/include/module.modulemap | 1 - libcxx/include/type_traits | 4 --- 5 files changed, 10 insertions(+), 39 deletions(-) delete mode 100644 libcxx/include/__type_traits/is_scoped_enum.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index cd64fe91449c2..26bad4f656a07 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -793,7 +793,6 @@ set(files __type_traits/is_referenceable.h __type_traits/is_same.h __type_traits/is_scalar.h - __type_traits/is_scoped_enum.h __type_traits/is_signed.h __type_traits/is_signed_integer.h __type_traits/is_specialization.h diff --git a/libcxx/include/__type_traits/is_enum.h b/libcxx/include/__type_traits/is_enum.h index 77ca3ea108742..2fab6db2c8d50 100644 --- a/libcxx/include/__type_traits/is_enum.h +++ b/libcxx/include/__type_traits/is_enum.h @@ -26,6 +26,16 @@ template inline constexpr bool is_enum_v = __is_enum(_Tp); #endif +#if _LIBCPP_STD_VER >= 
23 + +template +struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {}; + +template +inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp); + +#endif // _LIBCPP_STD_VER >= 23 + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___TYPE_TRAITS_IS_ENUM_H diff --git a/libcxx/include/__type_traits/is_scoped_enum.h b/libcxx/include/__type_traits/is_scoped_enum.h deleted file mode 100644 index cb3e25cf57331..0000000000000 --- a/libcxx/include/__type_traits/is_scoped_enum.h +++ /dev/null @@ -1,33 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_SCOPED_ENUM_H -#define _LIBCPP___TYPE_TRAITS_IS_SCOPED_ENUM_H - -#include <__config> -#include <__type_traits/integral_constant.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -#if _LIBCPP_STD_VER >= 23 - -template -struct _LIBCPP_TEMPLATE_VIS is_scoped_enum : bool_constant<__is_scoped_enum(_Tp)> {}; - -template -inline constexpr bool is_scoped_enum_v = __is_scoped_enum(_Tp); - -#endif // _LIBCPP_STD_VER >= 23 - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_SCOPED_ENUM_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 7608aef3f3a43..5ed284d80f35e 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -2009,7 +2009,6 @@ module std_private_type_traits_is_scalar [system header "__type_traits/is_scalar.h" export std_private_type_traits_is_null_pointer } -module std_private_type_traits_is_scoped_enum [system] { header "__type_traits/is_scoped_enum.h" } module 
std_private_type_traits_is_signed [system] { header "__type_traits/is_signed.h" } module std_private_type_traits_is_signed_integer [system] { header "__type_traits/is_signed_integer.h" } module std_private_type_traits_is_specialization [system] { header "__type_traits/is_specialization.h" } diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index ffa137338b6a2..7f231cd09df51 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -514,10 +514,6 @@ namespace std # include <__type_traits/unwrap_ref.h> #endif -#if _LIBCPP_STD_VER >= 23 -# include <__type_traits/is_scoped_enum.h> -#endif - #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) From 561246e90282a72b5b0c437cbbdae171526aad8f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 18 Jul 2024 12:21:31 +0200 Subject: [PATCH 013/486] [libc++][NFC] Remove wrong #endif comment --- libcxx/include/__type_traits/remove_cv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/__type_traits/remove_cv.h b/libcxx/include/__type_traits/remove_cv.h index 2c4e9e419a1be..50e9f3e8aa78d 100644 --- a/libcxx/include/__type_traits/remove_cv.h +++ b/libcxx/include/__type_traits/remove_cv.h @@ -28,7 +28,7 @@ using __remove_cv_t = typename remove_cv<_Tp>::type; #else template using __remove_cv_t = __remove_cv(_Tp); -#endif // __has_builtin(__remove_cv) +#endif #if _LIBCPP_STD_VER >= 14 template From 15495b8cd4051d05c1b88c919e7c509a8ea4056a Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Thu, 18 Jul 2024 15:20:57 +0200 Subject: [PATCH 014/486] [mlir] Fix unused-variable warning w/o assertions. 
(#99489) --- mlir/test/CAPI/rewrite.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/test/CAPI/rewrite.c b/mlir/test/CAPI/rewrite.c index a8b593eabb781..b33d225767046 100644 --- a/mlir/test/CAPI/rewrite.c +++ b/mlir/test/CAPI/rewrite.c @@ -68,6 +68,8 @@ void testInsertionPoint(MlirContext ctx) { // Get insertion blocks MlirBlock block1 = mlirRewriterBaseGetBlock(rewriter); MlirBlock block2 = mlirRewriterBaseGetInsertionBlock(rewriter); + (void)block1; + (void)block2; assert(body.ptr == block1.ptr); assert(body.ptr == block2.ptr); From d00b35534d068510025d22e5bd9c4fdac45757fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 18 Jul 2024 09:01:42 +0200 Subject: [PATCH 015/486] [clang][Interp] Fix CheckCallable for undefined-and-not-constexpr fns --- clang/lib/AST/Interp/Interp.cpp | 86 +++++++++++++++++---------------- clang/lib/AST/Interp/Interp.h | 6 +-- clang/test/AST/Interp/cxx2a.cpp | 15 ++++++ 3 files changed, 63 insertions(+), 44 deletions(-) create mode 100644 clang/test/AST/Interp/cxx2a.cpp diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 2be9b5360d055..be47f72e65a29 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -579,57 +579,61 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { return false; } - if (!F->isConstexpr() || !F->hasBody()) { - const SourceLocation &Loc = S.Current->getLocation(OpPC); - if (S.getLangOpts().CPlusPlus11) { - const FunctionDecl *DiagDecl = F->getDecl(); + if (F->isConstexpr() && F->hasBody() && F->getDecl()->isConstexpr()) + return true; - // Invalid decls have been diagnosed before. - if (DiagDecl->isInvalidDecl()) - return false; + // Implicitly constexpr. + if (F->isLambdaStaticInvoker()) + return true; - // If this function is not constexpr because it is an inherited - // non-constexpr constructor, diagnose that directly. 
- const auto *CD = dyn_cast(DiagDecl); - if (CD && CD->isInheritingConstructor()) { - const auto *Inherited = CD->getInheritedConstructor().getConstructor(); - if (!Inherited->isConstexpr()) - DiagDecl = CD = Inherited; - } + const SourceLocation &Loc = S.Current->getLocation(OpPC); + if (S.getLangOpts().CPlusPlus11) { + const FunctionDecl *DiagDecl = F->getDecl(); + + // Invalid decls have been diagnosed before. + if (DiagDecl->isInvalidDecl()) + return false; + + // If this function is not constexpr because it is an inherited + // non-constexpr constructor, diagnose that directly. + const auto *CD = dyn_cast(DiagDecl); + if (CD && CD->isInheritingConstructor()) { + const auto *Inherited = CD->getInheritedConstructor().getConstructor(); + if (!Inherited->isConstexpr()) + DiagDecl = CD = Inherited; + } - // FIXME: If DiagDecl is an implicitly-declared special member function - // or an inheriting constructor, we should be much more explicit about why - // it's not constexpr. - if (CD && CD->isInheritingConstructor()) { - S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1) + // FIXME: If DiagDecl is an implicitly-declared special member function + // or an inheriting constructor, we should be much more explicit about why + // it's not constexpr. + if (CD && CD->isInheritingConstructor()) { + S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1) << CD->getInheritedConstructor().getConstructor()->getParent(); - S.Note(DiagDecl->getLocation(), diag::note_declared_at); - } else { - // Don't emit anything if the function isn't defined and we're checking - // for a constant expression. It might be defined at the point we're - // actually calling it. 
- bool IsExtern = DiagDecl->getStorageClass() == SC_Extern; - if (!DiagDecl->isDefined() && !IsExtern && - S.checkingPotentialConstantExpression()) - return false; + S.Note(DiagDecl->getLocation(), diag::note_declared_at); + } else { + // Don't emit anything if the function isn't defined and we're checking + // for a constant expression. It might be defined at the point we're + // actually calling it. + bool IsExtern = DiagDecl->getStorageClass() == SC_Extern; + if (!DiagDecl->isDefined() && !IsExtern && DiagDecl->isConstexpr() && + S.checkingPotentialConstantExpression()) + return false; - // If the declaration is defined, declared 'constexpr' _and_ has a body, - // the below diagnostic doesn't add anything useful. - if (DiagDecl->isDefined() && DiagDecl->isConstexpr() && - DiagDecl->hasBody()) - return false; + // If the declaration is defined, declared 'constexpr' _and_ has a body, + // the below diagnostic doesn't add anything useful. + if (DiagDecl->isDefined() && DiagDecl->isConstexpr() && + DiagDecl->hasBody()) + return false; - S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1) + S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1) << DiagDecl->isConstexpr() << (bool)CD << DiagDecl; - S.Note(DiagDecl->getLocation(), diag::note_declared_at); - } - } else { - S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr); + S.Note(DiagDecl->getLocation(), diag::note_declared_at); } - return false; + } else { + S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr); } - return true; + return false; } bool CheckCallDepth(InterpState &S, CodePtr OpPC) { diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 17b3157cb40a9..2e159012f5ffd 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2531,14 +2531,14 @@ inline bool Call(InterpState &S, CodePtr OpPC, const Function *Func, if (!CheckInvoke(S, OpPC, ThisPtr)) return false; } - - if (S.checkingPotentialConstantExpression()) - return false; } if 
(!CheckCallable(S, OpPC, Func)) return false; + if (Func->hasThisPointer() && S.checkingPotentialConstantExpression()) + return false; + if (!CheckCallDepth(S, OpPC)) return false; diff --git a/clang/test/AST/Interp/cxx2a.cpp b/clang/test/AST/Interp/cxx2a.cpp new file mode 100644 index 0000000000000..27d1aa1a27f75 --- /dev/null +++ b/clang/test/AST/Interp/cxx2a.cpp @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=ref,both %s +// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=expected,both %s -fexperimental-new-constant-interpreter + +template +struct S { + S() requires (N==1) = default; + S() requires (N==2) {} // both-note {{declared here}} + consteval S() requires (N==3) = default; +}; + +consteval int aConstevalFunction() { // both-error {{consteval function never produces a constant expression}} + S<2> s4; // both-note {{non-constexpr constructor 'S' cannot be used in a constant expression}} + return 0; +} +/// We're NOT calling the above function. The diagnostics should appear anyway. 
From d9cb65ff483a2f79d0d3f0239796abe829372e52 Mon Sep 17 00:00:00 2001 From: Romaric Jodin Date: Thu, 18 Jul 2024 15:28:58 +0200 Subject: [PATCH 016/486] libclc: fix convert with half (#99481) Fix following update of libclc introducing more fp16 support: https://github.com/llvm/llvm-project/commit/7e6a73959ae97b1f9476a90290a492ba90cb950d --- libclc/generic/include/clc/convert.h | 1 + libclc/generic/lib/gen_convert.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/libclc/generic/include/clc/convert.h b/libclc/generic/include/clc/convert.h index db7bb0402491e..8219df47ad2c6 100644 --- a/libclc/generic/include/clc/convert.h +++ b/libclc/generic/include/clc/convert.h @@ -62,6 +62,7 @@ #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ _CLC_VECTOR_CONVERT_TO1(SUFFIX) \ _CLC_VECTOR_CONVERT_FROM(half, SUFFIX) +#else #define _CLC_VECTOR_CONVERT_TO(SUFFIX) \ _CLC_VECTOR_CONVERT_TO1(SUFFIX) #endif diff --git a/libclc/generic/lib/gen_convert.py b/libclc/generic/lib/gen_convert.py index bd36faa4e9197..41bd8cebf88b6 100644 --- a/libclc/generic/lib/gen_convert.py +++ b/libclc/generic/lib/gen_convert.py @@ -142,7 +142,10 @@ def conditional_guard(src, dst): float64_count = float64_count + 1 elif dst in float16_types: float16_count = float16_count + 1 - if float64_count > 0: + if float64_count > 0 and float16_count > 0: + print("#if defined(cl_khr_fp16) && defined(cl_khr_fp64)") + return True + elif float64_count > 0: # In embedded profile, if cl_khr_fp64 is supported cles_khr_int64 has to be print("#ifdef cl_khr_fp64") return True From ad7aeb0ff58ebd29f68adb85c64e8010639e2a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 18 Jul 2024 15:29:45 +0200 Subject: [PATCH 017/486] Revert "[clang][Interp] Fix CheckCallable for undefined-and-not-constexpr fns" This reverts commit d00b35534d068510025d22e5bd9c4fdac45757fb. 
This breaks the ms-constexpr test: https://lab.llvm.org/buildbot/#/builders/144/builds/2605 --- clang/lib/AST/Interp/Interp.cpp | 86 ++++++++++++++++----------------- clang/lib/AST/Interp/Interp.h | 6 +-- clang/test/AST/Interp/cxx2a.cpp | 15 ------ 3 files changed, 44 insertions(+), 63 deletions(-) delete mode 100644 clang/test/AST/Interp/cxx2a.cpp diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index be47f72e65a29..2be9b5360d055 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -579,61 +579,57 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { return false; } - if (F->isConstexpr() && F->hasBody() && F->getDecl()->isConstexpr()) - return true; - - // Implicitly constexpr. - if (F->isLambdaStaticInvoker()) - return true; - - const SourceLocation &Loc = S.Current->getLocation(OpPC); - if (S.getLangOpts().CPlusPlus11) { - const FunctionDecl *DiagDecl = F->getDecl(); + if (!F->isConstexpr() || !F->hasBody()) { + const SourceLocation &Loc = S.Current->getLocation(OpPC); + if (S.getLangOpts().CPlusPlus11) { + const FunctionDecl *DiagDecl = F->getDecl(); - // Invalid decls have been diagnosed before. - if (DiagDecl->isInvalidDecl()) - return false; + // Invalid decls have been diagnosed before. + if (DiagDecl->isInvalidDecl()) + return false; - // If this function is not constexpr because it is an inherited - // non-constexpr constructor, diagnose that directly. - const auto *CD = dyn_cast(DiagDecl); - if (CD && CD->isInheritingConstructor()) { - const auto *Inherited = CD->getInheritedConstructor().getConstructor(); - if (!Inherited->isConstexpr()) - DiagDecl = CD = Inherited; - } + // If this function is not constexpr because it is an inherited + // non-constexpr constructor, diagnose that directly. 
+ const auto *CD = dyn_cast(DiagDecl); + if (CD && CD->isInheritingConstructor()) { + const auto *Inherited = CD->getInheritedConstructor().getConstructor(); + if (!Inherited->isConstexpr()) + DiagDecl = CD = Inherited; + } - // FIXME: If DiagDecl is an implicitly-declared special member function - // or an inheriting constructor, we should be much more explicit about why - // it's not constexpr. - if (CD && CD->isInheritingConstructor()) { - S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1) + // FIXME: If DiagDecl is an implicitly-declared special member function + // or an inheriting constructor, we should be much more explicit about why + // it's not constexpr. + if (CD && CD->isInheritingConstructor()) { + S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1) << CD->getInheritedConstructor().getConstructor()->getParent(); - S.Note(DiagDecl->getLocation(), diag::note_declared_at); - } else { - // Don't emit anything if the function isn't defined and we're checking - // for a constant expression. It might be defined at the point we're - // actually calling it. - bool IsExtern = DiagDecl->getStorageClass() == SC_Extern; - if (!DiagDecl->isDefined() && !IsExtern && DiagDecl->isConstexpr() && - S.checkingPotentialConstantExpression()) - return false; + S.Note(DiagDecl->getLocation(), diag::note_declared_at); + } else { + // Don't emit anything if the function isn't defined and we're checking + // for a constant expression. It might be defined at the point we're + // actually calling it. + bool IsExtern = DiagDecl->getStorageClass() == SC_Extern; + if (!DiagDecl->isDefined() && !IsExtern && + S.checkingPotentialConstantExpression()) + return false; - // If the declaration is defined, declared 'constexpr' _and_ has a body, - // the below diagnostic doesn't add anything useful. 
- if (DiagDecl->isDefined() && DiagDecl->isConstexpr() && - DiagDecl->hasBody()) - return false; + // If the declaration is defined, declared 'constexpr' _and_ has a body, + // the below diagnostic doesn't add anything useful. + if (DiagDecl->isDefined() && DiagDecl->isConstexpr() && + DiagDecl->hasBody()) + return false; - S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1) + S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1) << DiagDecl->isConstexpr() << (bool)CD << DiagDecl; - S.Note(DiagDecl->getLocation(), diag::note_declared_at); + S.Note(DiagDecl->getLocation(), diag::note_declared_at); + } + } else { + S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr); } - } else { - S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr); + return false; } - return false; + return true; } bool CheckCallDepth(InterpState &S, CodePtr OpPC) { diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 2e159012f5ffd..17b3157cb40a9 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2531,14 +2531,14 @@ inline bool Call(InterpState &S, CodePtr OpPC, const Function *Func, if (!CheckInvoke(S, OpPC, ThisPtr)) return false; } + + if (S.checkingPotentialConstantExpression()) + return false; } if (!CheckCallable(S, OpPC, Func)) return false; - if (Func->hasThisPointer() && S.checkingPotentialConstantExpression()) - return false; - if (!CheckCallDepth(S, OpPC)) return false; diff --git a/clang/test/AST/Interp/cxx2a.cpp b/clang/test/AST/Interp/cxx2a.cpp deleted file mode 100644 index 27d1aa1a27f75..0000000000000 --- a/clang/test/AST/Interp/cxx2a.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=ref,both %s -// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=expected,both %s -fexperimental-new-constant-interpreter - -template -struct S { - S() requires (N==1) = default; - S() requires (N==2) {} // both-note {{declared here}} - consteval S() requires 
(N==3) = default;
-};
-
-consteval int aConstevalFunction() { // both-error {{consteval function never produces a constant expression}}
-  S<2> s4; // both-note {{non-constexpr constructor 'S' cannot be used in a constant expression}}
-  return 0;
-}
-/// We're NOT calling the above function. The diagnostics should appear anyway.

From a778909168746e266ad52b817a758328cdd28311 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Thu, 18 Jul 2024 13:31:47 +0000
Subject: [PATCH 018/486] [gn build] Port 4dfa75c663e5

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index b976e9745fbef..0349e636f8009 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -863,7 +863,6 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_referenceable.h",
       "__type_traits/is_same.h",
       "__type_traits/is_scalar.h",
-      "__type_traits/is_scoped_enum.h",
       "__type_traits/is_signed.h",
       "__type_traits/is_signed_integer.h",
       "__type_traits/is_specialization.h",

From 684a61506a3ddc943b8baef1d14c96bbf82e6c04 Mon Sep 17 00:00:00 2001
From: "A. Jiang"
Date: Thu, 18 Jul 2024 21:58:35 +0800
Subject: [PATCH 019/486] [libc++][chrono] Remove non-standard relational
 operators for `std::chrono::weekday` (#98730)

These operators are absent in https://eel.is/c++draft/time.syn and a
note in https://eel.is/c++draft/time.cal.wd.overview#1 indicates that
the absence is intended.

This patch removes the undocumented extension, while providing a
migration path for vendors by providing the
`_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS` macro. This macro
will be honored for the LLVM 19 release and will be removed after that,
at which point the relational operators will be removed unconditionally.
--- libcxx/docs/ReleaseNotes/19.rst | 10 ++++++++++ libcxx/docs/ReleaseNotes/20.rst | 4 ++++ libcxx/include/__chrono/weekday.h | 3 +++ .../time.cal.weekday.nonmembers/comparisons.pass.cpp | 3 +++ 4 files changed, 20 insertions(+) diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index a28dae52e8579..36cb23dfde6c9 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -143,6 +143,12 @@ Deprecations and Removals of randomness, and others. Users that were checking whether including a header would fail (e.g. via a script or CMake's ``try_compile`` will experience a change in behavior). +- libc++ no longer supports relational comparison for ``std::chrono::weekday``. The relational comparison operators were + provided as an undocumented extension. If you were using relational comparison on ``std::chrono::weekday``, compare + the results of ``c_encoding()`` or ``iso_encoding()`` instead. The + ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro can be defined to temporarily re-enable this extension. + This macro will be honored for one release and ignored starting in LLVM 20. + - The operators in the ``rel_ops`` namespace have been deprecated. The deprecation is part of the paper P0768R1 "Library Support for the Spaceship (Comparison) Operator". @@ -157,6 +163,10 @@ LLVM 20 - The C++20 synchronization library will be removed entirely in language modes prior to C++20 in LLVM 20. +- The relational operators for ``std::chrono::weekday`` will be removed entirely, and the + ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro that was used to re-enable this extension will be + ignored in LLVM 20. 
+ LLVM 21 ~~~~~~~ TODO diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index 79b9788f92eda..fb677b1667ddc 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -55,6 +55,10 @@ Deprecations and Removals - TODO: The C++20 synchronization library will be removed entirely in language modes prior to C++20 in LLVM 20. +- TODO: The relational operators for ``std::chrono::weekday`` will be removed entirely, and the + ``_LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS`` macro that was used to re-enable this extension will be + ignored in LLVM 20. + Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/__chrono/weekday.h b/libcxx/include/__chrono/weekday.h index 5a7dedc6e3a16..86c780cc71825 100644 --- a/libcxx/include/__chrono/weekday.h +++ b/libcxx/include/__chrono/weekday.h @@ -79,6 +79,8 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator==(const weekday& __lhs, con return __lhs.c_encoding() == __rhs.c_encoding(); } +// TODO(LLVM 20): Remove the escape hatch +# ifdef _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<(const weekday& __lhs, const weekday& __rhs) noexcept { return __lhs.c_encoding() < __rhs.c_encoding(); } @@ -94,6 +96,7 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator<=(const weekday& __lhs, con _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>=(const weekday& __lhs, const weekday& __rhs) noexcept { return !(__lhs < __rhs); } +# endif // _LIBCPP_ENABLE_REMOVED_WEEKDAY_RELATIONAL_OPERATORS _LIBCPP_HIDE_FROM_ABI inline constexpr weekday operator+(const weekday& __lhs, const days& __rhs) noexcept { auto const __mu = static_cast(__lhs.c_encoding()) + __rhs.count(); diff --git a/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp index 
33d8cd9f776d2..0101d205b5555 100644 --- a/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.weekday/time.cal.weekday.nonmembers/comparisons.pass.cpp @@ -17,10 +17,13 @@ #include #include #include +#include #include "test_macros.h" #include "test_comparisons.h" +static_assert(!std::totally_ordered); + int main(int, char**) { using weekday = std::chrono::weekday; From 078198f310d55925ccd9e1aa5b6ff4af3b36bbc7 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 18 Jul 2024 15:04:02 +0100 Subject: [PATCH 020/486] [DebugInfo][InstrRef] Index DebugVariables and some DILocations (#99318) A lot of time in LiveDebugValues is spent computing DenseMap keys for DebugVariables, and they're made up of three pointers, so are large. This patch installs an index for them: for the SSA and value-to-location mapping parts of InstrRefBasedLDV we don't need to access things like the variable declaration or the inlining site, so just use a uint32_t identifier for each variable fragment that's tracked. The compile-time performance improvements are substantial (almost 0.4% on the tracker). About 80% of this patch is just replacing DebugVariable references with DebugVariableIDs instead, however there are some larger consequences. We spend lots of time fetching DILocations when emitting DBG_VALUE instructions, so index those with the DebugVariables: this means all DILocations on all new DBG_VALUE instructions will normalise to the first-seen DILocation for the variable (which should be fine). We also used to keep an ordering of when each variable was seen first in a DBG_* instruction, in the AllVarsNumbering collection, so that we can emit new DBG_* instructions in a stable order. We can hang this off the DebugVariable index instead, so AllVarsNumbering is deleted. 
Finally, rather than ordering by AllVarsNumbering just before DBG_* instructions are linked into the output MIR, store instructions along with their DebugVariableID, so that they can be sorted by that instead. --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 201 +++++++++--------- .../LiveDebugValues/InstrRefBasedImpl.h | 106 ++++++--- .../MIR/X86/live-debug-values-fragments.mir | 4 +- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 3 +- 4 files changed, 186 insertions(+), 128 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 247258a1ff553..b9cf36a07846c 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -183,6 +183,7 @@ class TransferTracker { /// information from it. (XXX make it const?) MLocTracker *MTracker; MachineFunction &MF; + const DebugVariableMap &DVMap; bool ShouldEmitDebugEntryValues; /// Record of all changes in variable locations at a block position. Awkwardly @@ -191,7 +192,9 @@ class TransferTracker { struct Transfer { MachineBasicBlock::instr_iterator Pos; /// Position to insert DBG_VALUes MachineBasicBlock *MBB; /// non-null if we should insert after. - SmallVector Insts; /// Vector of DBG_VALUEs to insert. + /// Vector of DBG_VALUEs to insert. Store with their DebugVariableID so that + /// they can be sorted into a stable order for emission at a later time. + SmallVector, 4> Insts; }; /// Stores the resolved operands (machine locations and constants) and @@ -227,15 +230,15 @@ class TransferTracker { /// Map from LocIdxes to which DebugVariables are based that location. /// Mantained while stepping through the block. Not accurate if /// VarLocs[Idx] != MTracker->LocIdxToIDNum[Idx]. - DenseMap> ActiveMLocs; + DenseMap> ActiveMLocs; /// Map from DebugVariable to it's current location and qualifying meta /// information. 
To be used in conjunction with ActiveMLocs to construct /// enough information for the DBG_VALUEs for a particular LocIdx. - DenseMap ActiveVLocs; + DenseMap ActiveVLocs; /// Temporary cache of DBG_VALUEs to be entered into the Transfers collection. - SmallVector PendingDbgValues; + SmallVector, 4> PendingDbgValues; /// Record of a use-before-def: created when a value that's live-in to the /// current block isn't available in any machine location, but it will be @@ -244,12 +247,12 @@ class TransferTracker { /// Value of this variable, def'd in block. SmallVector Values; /// Identity of this variable. - DebugVariable Var; + DebugVariableID VarID; /// Additional variable properties. DbgValueProperties Properties; - UseBeforeDef(ArrayRef Values, const DebugVariable &Var, + UseBeforeDef(ArrayRef Values, DebugVariableID VarID, const DbgValueProperties &Properties) - : Values(Values.begin(), Values.end()), Var(Var), + : Values(Values.begin(), Values.end()), VarID(VarID), Properties(Properties) {} }; @@ -260,15 +263,16 @@ class TransferTracker { /// The set of variables that are in UseBeforeDefs and can become a location /// once the relevant value is defined. An element being erased from this /// collection prevents the use-before-def materializing. - DenseSet UseBeforeDefVariables; + DenseSet UseBeforeDefVariables; const TargetRegisterInfo &TRI; const BitVector &CalleeSavedRegs; TransferTracker(const TargetInstrInfo *TII, MLocTracker *MTracker, - MachineFunction &MF, const TargetRegisterInfo &TRI, + MachineFunction &MF, const DebugVariableMap &DVMap, + const TargetRegisterInfo &TRI, const BitVector &CalleeSavedRegs, const TargetPassConfig &TPC) - : TII(TII), MTracker(MTracker), MF(MF), TRI(TRI), + : TII(TII), MTracker(MTracker), MF(MF), DVMap(DVMap), TRI(TRI), CalleeSavedRegs(CalleeSavedRegs) { TLI = MF.getSubtarget().getTargetLowering(); auto &TM = TPC.getTM(); @@ -352,7 +356,7 @@ class TransferTracker { /// determine the values used by Value. 
void loadVarInloc(MachineBasicBlock &MBB, DbgOpIDMap &DbgOpStore, const SmallVectorImpl &ValueToLoc, - DebugVariable Var, DbgValue Value) { + DebugVariableID VarID, DbgValue Value) { SmallVector DbgOps; SmallVector ResolvedDbgOps; bool IsValueValid = true; @@ -401,7 +405,7 @@ class TransferTracker { static_cast(Num.getInst())); continue; } - recoverAsEntryValue(Var, Value.Properties, Num); + recoverAsEntryValue(VarID, Value.Properties, Num); IsValueValid = false; break; } @@ -419,8 +423,7 @@ class TransferTracker { // Add UseBeforeDef entry for the last value to be defined in this block. if (LastUseBeforeDef) { - addUseBeforeDef(Var, Value.Properties, DbgOps, - LastUseBeforeDef); + addUseBeforeDef(VarID, Value.Properties, DbgOps, LastUseBeforeDef); return; } @@ -428,13 +431,15 @@ class TransferTracker { // the transfer. for (const ResolvedDbgOp &Op : ResolvedDbgOps) if (!Op.IsConst) - ActiveMLocs[Op.Loc].insert(Var); + ActiveMLocs[Op.Loc].insert(VarID); auto NewValue = ResolvedDbgValue{ResolvedDbgOps, Value.Properties}; - auto Result = ActiveVLocs.insert(std::make_pair(Var, NewValue)); + auto Result = ActiveVLocs.insert(std::make_pair(VarID, NewValue)); if (!Result.second) Result.first->second = NewValue; + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); PendingDbgValues.push_back( - MTracker->emitLoc(ResolvedDbgOps, Var, Value.Properties)); + std::make_pair(VarID, &*MTracker->emitLoc(ResolvedDbgOps, Var, DILoc, + Value.Properties))); } /// Load object with live-in variable values. \p mlocs contains the live-in @@ -445,7 +450,7 @@ class TransferTracker { /// FIXME: could just examine mloctracker instead of passing in \p mlocs? 
void loadInlocs(MachineBasicBlock &MBB, ValueTable &MLocs, DbgOpIDMap &DbgOpStore, - const SmallVectorImpl> &VLocs, + const SmallVectorImpl> &VLocs, unsigned NumLocs) { ActiveMLocs.clear(); ActiveVLocs.clear(); @@ -506,11 +511,11 @@ class TransferTracker { /// Record that \p Var has value \p ID, a value that becomes available /// later in the function. - void addUseBeforeDef(const DebugVariable &Var, + void addUseBeforeDef(DebugVariableID VarID, const DbgValueProperties &Properties, const SmallVectorImpl &DbgOps, unsigned Inst) { - UseBeforeDefs[Inst].emplace_back(DbgOps, Var, Properties); - UseBeforeDefVariables.insert(Var); + UseBeforeDefs[Inst].emplace_back(DbgOps, VarID, Properties); + UseBeforeDefVariables.insert(VarID); } /// After the instruction at index \p Inst and position \p pos has been @@ -529,7 +534,7 @@ class TransferTracker { // Populate ValueToLoc with illegal default mappings for every value used by // any UseBeforeDef variables for this instruction. for (auto &Use : MIt->second) { - if (!UseBeforeDefVariables.count(Use.Var)) + if (!UseBeforeDefVariables.count(Use.VarID)) continue; for (DbgOp &Op : Use.Values) { @@ -568,7 +573,7 @@ class TransferTracker { // Using the map of values to locations, produce a final set of values for // this variable. for (auto &Use : MIt->second) { - if (!UseBeforeDefVariables.count(Use.Var)) + if (!UseBeforeDefVariables.count(Use.VarID)) continue; SmallVector DbgOps; @@ -591,8 +596,9 @@ class TransferTracker { continue; // Otherwise, we're good to go. 
- PendingDbgValues.push_back( - MTracker->emitLoc(DbgOps, Use.Var, Use.Properties)); + auto &[Var, DILoc] = DVMap.lookupDVID(Use.VarID); + PendingDbgValues.push_back(std::make_pair( + Use.VarID, MTracker->emitLoc(DbgOps, Var, DILoc, Use.Properties))); } flushDbgValues(pos, nullptr); } @@ -642,7 +648,7 @@ class TransferTracker { return Reg != SP && Reg != FP; } - bool recoverAsEntryValue(const DebugVariable &Var, + bool recoverAsEntryValue(DebugVariableID VarID, const DbgValueProperties &Prop, const ValueIDNum &Num) { // Is this variable location a candidate to be an entry value. First, @@ -663,6 +669,8 @@ class TransferTracker { DIExpr = *NonVariadicExpression; } + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); + // Is the variable appropriate for entry values (i.e., is a parameter). if (!isEntryValueVariable(Var, DIExpr)) return false; @@ -676,9 +684,8 @@ class TransferTracker { DIExpression::prepend(DIExpr, DIExpression::EntryValue); Register Reg = MTracker->LocIdxToLocID[Num.getLoc()]; MachineOperand MO = MachineOperand::CreateReg(Reg, false); - - PendingDbgValues.push_back( - emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false})); + PendingDbgValues.push_back(std::make_pair( + VarID, &*emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false}))); return true; } @@ -687,19 +694,20 @@ class TransferTracker { DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); DbgValueProperties Properties(MI); + DebugVariableID VarID = DVMap.getDVID(Var); // Ignore non-register locations, we don't transfer those. if (MI.isUndefDebugValue() || all_of(MI.debug_operands(), [](const MachineOperand &MO) { return !MO.isReg(); })) { - auto It = ActiveVLocs.find(Var); + auto It = ActiveVLocs.find(VarID); if (It != ActiveVLocs.end()) { for (LocIdx Loc : It->second.loc_indices()) - ActiveMLocs[Loc].erase(Var); + ActiveMLocs[Loc].erase(VarID); ActiveVLocs.erase(It); } // Any use-before-defs no longer apply. 
- UseBeforeDefVariables.erase(Var); + UseBeforeDefVariables.erase(VarID); return; } @@ -725,14 +733,15 @@ class TransferTracker { SmallVectorImpl &NewLocs) { DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); + DebugVariableID VarID = DVMap.getDVID(Var); // Any use-before-defs no longer apply. - UseBeforeDefVariables.erase(Var); + UseBeforeDefVariables.erase(VarID); // Erase any previous location. - auto It = ActiveVLocs.find(Var); + auto It = ActiveVLocs.find(VarID); if (It != ActiveVLocs.end()) { for (LocIdx Loc : It->second.loc_indices()) - ActiveMLocs[Loc].erase(Var); + ActiveMLocs[Loc].erase(VarID); } // If there _is_ no new location, all we had to do was erase. @@ -742,7 +751,7 @@ class TransferTracker { return; } - SmallVector> LostMLocs; + SmallVector> LostMLocs; for (ResolvedDbgOp &Op : NewLocs) { if (Op.IsConst) continue; @@ -769,17 +778,17 @@ class TransferTracker { for (const auto &LostMLoc : LostMLocs) ActiveMLocs[LostMLoc.first].erase(LostMLoc.second); LostMLocs.clear(); - It = ActiveVLocs.find(Var); + It = ActiveVLocs.find(VarID); ActiveMLocs[NewLoc.asU64()].clear(); VarLocs[NewLoc.asU64()] = MTracker->readMLoc(NewLoc); } - ActiveMLocs[NewLoc].insert(Var); + ActiveMLocs[NewLoc].insert(VarID); } if (It == ActiveVLocs.end()) { ActiveVLocs.insert( - std::make_pair(Var, ResolvedDbgValue(NewLocs, Properties))); + std::make_pair(VarID, ResolvedDbgValue(NewLocs, Properties))); } else { It->second.Ops.assign(NewLocs); It->second.Properties = Properties; @@ -822,21 +831,21 @@ class TransferTracker { // explicitly undef, then stop here. if (!NewLoc && !MakeUndef) { // Try and recover a few more locations with entry values. 
- for (const auto &Var : ActiveMLocIt->second) { - auto &Prop = ActiveVLocs.find(Var)->second.Properties; - recoverAsEntryValue(Var, Prop, OldValue); + for (DebugVariableID VarID : ActiveMLocIt->second) { + auto &Prop = ActiveVLocs.find(VarID)->second.Properties; + recoverAsEntryValue(VarID, Prop, OldValue); } flushDbgValues(Pos, nullptr); return; } // Examine all the variables based on this location. - DenseSet NewMLocs; + DenseSet NewMLocs; // If no new location has been found, every variable that depends on this // MLoc is dead, so end their existing MLoc->Var mappings as well. - SmallVector> LostMLocs; - for (const auto &Var : ActiveMLocIt->second) { - auto ActiveVLocIt = ActiveVLocs.find(Var); + SmallVector> LostMLocs; + for (DebugVariableID VarID : ActiveMLocIt->second) { + auto ActiveVLocIt = ActiveVLocs.find(VarID); // Re-state the variable location: if there's no replacement then NewLoc // is std::nullopt and a $noreg DBG_VALUE will be created. Otherwise, a // DBG_VALUE identifying the alternative location will be emitted. @@ -855,19 +864,21 @@ class TransferTracker { replace_copy(ActiveVLocIt->second.Ops, DbgOps.begin(), OldOp, NewOp); } - PendingDbgValues.push_back(MTracker->emitLoc(DbgOps, Var, Properties)); + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); + PendingDbgValues.push_back(std::make_pair( + VarID, &*MTracker->emitLoc(DbgOps, Var, DILoc, Properties))); // Update machine locations <=> variable locations maps. Defer updating // ActiveMLocs to avoid invalidating the ActiveMLocIt iterator. if (!NewLoc) { for (LocIdx Loc : ActiveVLocIt->second.loc_indices()) { if (Loc != MLoc) - LostMLocs.emplace_back(Loc, Var); + LostMLocs.emplace_back(Loc, VarID); } ActiveVLocs.erase(ActiveVLocIt); } else { ActiveVLocIt->second.Ops = DbgOps; - NewMLocs.insert(Var); + NewMLocs.insert(VarID); } } @@ -891,8 +902,8 @@ class TransferTracker { // Commit ActiveMLoc changes. 
ActiveMLocIt->second.clear(); if (!NewMLocs.empty()) - for (auto &Var : NewMLocs) - ActiveMLocs[*NewLoc].insert(Var); + for (DebugVariableID VarID : NewMLocs) + ActiveMLocs[*NewLoc].insert(VarID); } /// Transfer variables based on \p Src to be based on \p Dst. This handles @@ -915,17 +926,18 @@ class TransferTracker { // For each variable based on Src; create a location at Dst. ResolvedDbgOp SrcOp(Src); ResolvedDbgOp DstOp(Dst); - for (const auto &Var : MovingVars) { - auto ActiveVLocIt = ActiveVLocs.find(Var); + for (DebugVariableID VarID : MovingVars) { + auto ActiveVLocIt = ActiveVLocs.find(VarID); assert(ActiveVLocIt != ActiveVLocs.end()); // Update all instances of Src in the variable's tracked values to Dst. std::replace(ActiveVLocIt->second.Ops.begin(), ActiveVLocIt->second.Ops.end(), SrcOp, DstOp); - MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var, + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); + MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var, DILoc, ActiveVLocIt->second.Properties); - PendingDbgValues.push_back(MI); + PendingDbgValues.push_back(std::make_pair(VarID, MI)); } ActiveMLocs[Src].clear(); flushDbgValues(Pos, nullptr); @@ -1176,11 +1188,9 @@ LLVM_DUMP_METHOD void MLocTracker::dump_mloc_map() { MachineInstrBuilder MLocTracker::emitLoc(const SmallVectorImpl &DbgOps, - const DebugVariable &Var, + const DebugVariable &Var, const DILocation *DILoc, const DbgValueProperties &Properties) { - DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0, - Var.getVariable()->getScope(), - const_cast(Var.getInlinedAt())); + DebugLoc DL = DebugLoc(DILoc); const MCInstrDesc &Desc = Properties.IsVariadic ? 
TII.get(TargetOpcode::DBG_VALUE_LIST) @@ -1726,7 +1736,8 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, LastUseBeforeDef = std::max(LastUseBeforeDef, NewID.getInst()); } if (IsValidUseBeforeDef) { - TTracker->addUseBeforeDef(V, {MI.getDebugExpression(), false, true}, + DebugVariableID VID = DVMap.insertDVID(V, MI.getDebugLoc().get()); + TTracker->addUseBeforeDef(VID, {MI.getDebugExpression(), false, true}, DbgOps, LastUseBeforeDef); } } @@ -1735,9 +1746,11 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, // This DBG_VALUE is potentially a $noreg / undefined location, if // FoundLoc is illegal. // (XXX -- could morph the DBG_INSTR_REF in the future). - MachineInstr *DbgMI = MTracker->emitLoc(NewLocs, V, Properties); + MachineInstr *DbgMI = + MTracker->emitLoc(NewLocs, V, MI.getDebugLoc().get(), Properties); + DebugVariableID ID = DVMap.getDVID(V); - TTracker->PendingDbgValues.push_back(DbgMI); + TTracker->PendingDbgValues.push_back(std::make_pair(ID, DbgMI)); TTracker->flushDbgValues(MI.getIterator(), nullptr); return true; } @@ -3112,7 +3125,8 @@ void InstrRefBasedLDV::getBlocksForScope( } void InstrRefBasedLDV::buildVLocValueMap( - const DILocation *DILoc, const SmallSet &VarsWeCareAbout, + const DILocation *DILoc, + const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, SmallVectorImpl &AllTheVLocs) { @@ -3188,7 +3202,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // between blocks. This keeps the locality of working on one lexical scope at // at time, but avoids re-processing variable values because some other // variable has been assigned. - for (const auto &Var : VarsWeCareAbout) { + for (DebugVariableID VarID : VarsWeCareAbout) { // Re-initialize live-ins and live-outs, to clear the remains of previous // variables live-ins / live-outs. 
for (unsigned int I = 0; I < NumBlocks; ++I) { @@ -3202,7 +3216,7 @@ void InstrRefBasedLDV::buildVLocValueMap( SmallPtrSet DefBlocks; for (const MachineBasicBlock *ExpMBB : BlocksToExplore) { auto &TransferFunc = AllTheVLocs[ExpMBB->getNumber()].Vars; - if (TransferFunc.contains(Var)) + if (TransferFunc.contains(VarID)) DefBlocks.insert(const_cast(ExpMBB)); } @@ -3212,7 +3226,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // only one value definition, things are very simple. if (DefBlocks.size() == 1) { placePHIsForSingleVarDefinition(MutBlocksToExplore, *DefBlocks.begin(), - AllTheVLocs, Var, Output); + AllTheVLocs, VarID, Output); continue; } @@ -3285,7 +3299,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // Do transfer function. auto &VTracker = AllTheVLocs[MBB->getNumber()]; - auto TransferIt = VTracker.Vars.find(Var); + auto TransferIt = VTracker.Vars.find(VarID); if (TransferIt != VTracker.Vars.end()) { // Erase on empty transfer (DBG_VALUE $noreg). if (TransferIt->second.Kind == DbgValue::Undef) { @@ -3347,9 +3361,11 @@ void InstrRefBasedLDV::buildVLocValueMap( continue; if (BlockLiveIn->Kind == DbgValue::VPHI) BlockLiveIn->Kind = DbgValue::Def; + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() == - Var.getFragment() && "Fragment info missing during value prop"); - Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn)); + Var.getFragment() && + "Fragment info missing during value prop"); + Output[MBB->getNumber()].push_back(std::make_pair(VarID, *BlockLiveIn)); } } // Per-variable loop. 
@@ -3360,7 +3376,7 @@ void InstrRefBasedLDV::buildVLocValueMap( void InstrRefBasedLDV::placePHIsForSingleVarDefinition( const SmallPtrSetImpl &InScopeBlocks, MachineBasicBlock *AssignMBB, SmallVectorImpl &AllTheVLocs, - const DebugVariable &Var, LiveInsT &Output) { + DebugVariableID VarID, LiveInsT &Output) { // If there is a single definition of the variable, then working out it's // value everywhere is very simple: it's every block dominated by the // definition. At the dominance frontier, the usual algorithm would: @@ -3373,7 +3389,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition( // Pick out the variables value from the block transfer function. VLocTracker &VLocs = AllTheVLocs[AssignMBB->getNumber()]; - auto ValueIt = VLocs.Vars.find(Var); + auto ValueIt = VLocs.Vars.find(VarID); const DbgValue &Value = ValueIt->second; // If it's an explicit assignment of "undef", that means there is no location @@ -3388,7 +3404,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition( if (!DomTree->properlyDominates(AssignMBB, ScopeBlock)) continue; - Output[ScopeBlock->getNumber()].push_back({Var, Value}); + Output[ScopeBlock->getNumber()].push_back({VarID, Value}); } // All blocks that aren't dominated have no live-in value, thus no variable @@ -3515,9 +3531,9 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToAssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, SmallVectorImpl &AllTheVLocs, MachineFunction &MF, - DenseMap &AllVarsNumbering, const TargetPassConfig &TPC) { - TTracker = new TransferTracker(TII, MTracker, MF, *TRI, CalleeSavedRegs, TPC); + TTracker = + new TransferTracker(TII, MTracker, MF, DVMap, *TRI, CalleeSavedRegs, TPC); unsigned NumLocs = MTracker->getNumLocs(); VTracker = nullptr; @@ -3622,31 +3638,24 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( if (MInLocs.hasTableFor(*MBB)) EjectBlock(*MBB); - return emitTransfers(AllVarsNumbering); + return 
emitTransfers(); } -bool InstrRefBasedLDV::emitTransfers( - DenseMap &AllVarsNumbering) { +bool InstrRefBasedLDV::emitTransfers() { // Go through all the transfers recorded in the TransferTracker -- this is // both the live-ins to a block, and any movements of values that happen // in the middle. - for (const auto &P : TTracker->Transfers) { + for (auto &P : TTracker->Transfers) { // We have to insert DBG_VALUEs in a consistent order, otherwise they // appear in DWARF in different orders. Use the order that they appear // when walking through each block / each instruction, stored in - // AllVarsNumbering. - SmallVector> Insts; - for (MachineInstr *MI : P.Insts) { - DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(), - MI->getDebugLoc()->getInlinedAt()); - Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI); - } - llvm::sort(Insts, llvm::less_first()); + // DVMap. + llvm::sort(P.Insts, llvm::less_first()); // Insert either before or after the designated point... if (P.MBB) { MachineBasicBlock &MBB = *P.MBB; - for (const auto &Pair : Insts) + for (const auto &Pair : P.Insts) MBB.insert(P.Pos, Pair.second); } else { // Terminators, like tail calls, can clobber things. 
Don't try and place @@ -3655,7 +3664,7 @@ bool InstrRefBasedLDV::emitTransfers( continue; MachineBasicBlock &MBB = *P.Pos->getParent(); - for (const auto &Pair : Insts) + for (const auto &Pair : P.Insts) MBB.insertAfterBundle(P.Pos, Pair.second); } } @@ -3710,7 +3719,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, initialSetup(MF); MLocTransfer.resize(MaxNumBlocks); - vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr)); + vlocs.resize(MaxNumBlocks, VLocTracker(DVMap, OverlapFragments, EmptyExpr)); SavedLiveIns.resize(MaxNumBlocks); produceMLocTransferFunction(MF, MLocTransfer, MaxNumBlocks); @@ -3766,10 +3775,6 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, MTracker->reset(); } - // Number all variables in the order that they appear, to be used as a stable - // insertion order later. - DenseMap AllVarsNumbering; - // Map from one LexicalScope to all the variables in that scope. ScopeToVarsT ScopeToVars; @@ -3788,16 +3793,15 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, auto *VTracker = &vlocs[MBB->getNumber()]; // Collect each variable with a DBG_VALUE in this block. for (auto &idx : VTracker->Vars) { - const auto &Var = idx.first; - const DILocation *ScopeLoc = VTracker->Scopes[Var]; + DebugVariableID VarID = idx.first; + const DILocation *ScopeLoc = VTracker->Scopes[VarID]; assert(ScopeLoc != nullptr); auto *Scope = LS.findLexicalScope(ScopeLoc); // No insts in scope -> shouldn't have been recorded. assert(Scope != nullptr); - AllVarsNumbering.insert(std::make_pair(Var, AllVarsNumbering.size())); - ScopeToVars[Scope].insert(Var); + ScopeToVars[Scope].insert(VarID); ScopeToAssignBlocks[Scope].insert(VTracker->MBB); ScopeToDILocation[Scope] = ScopeLoc; ++VarAssignCount; @@ -3821,7 +3825,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // the "else" block of this condition. 
Changed = depthFirstVLocAndEmit( MaxNumBlocks, ScopeToDILocation, ScopeToVars, ScopeToAssignBlocks, - SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, AllVarsNumbering, *TPC); + SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, *TPC); } delete MTracker; @@ -3840,6 +3844,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, SeenFragments.clear(); SeenDbgPHIs.clear(); DbgOpStore.clear(); + DVMap.clear(); return Changed; } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index 8770983481c2f..8c03e38eee062 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -35,6 +35,44 @@ class DbgOpIDMap; using namespace llvm; +using DebugVariableID = unsigned; +using VarAndLoc = std::pair; + +/// Mapping from DebugVariable to/from a unique identifying number. Each +/// DebugVariable consists of three pointers, and after a small amount of +/// work to identify overlapping fragments of variables we mostly only use +/// DebugVariables as identities of variables. It's much more compile-time +/// efficient to use an ID number instead, which this class provides. +class DebugVariableMap { + DenseMap VarToIdx; + SmallVector IdxToVar; + +public: + DebugVariableID getDVID(const DebugVariable &Var) const { + auto It = VarToIdx.find(Var); + assert(It != VarToIdx.end()); + return It->second; + } + + DebugVariableID insertDVID(DebugVariable &Var, const DILocation *Loc) { + unsigned Size = VarToIdx.size(); + auto ItPair = VarToIdx.insert({Var, Size}); + if (ItPair.second) { + IdxToVar.push_back({Var, Loc}); + return Size; + } + + return ItPair.first->second; + } + + const VarAndLoc &lookupDVID(DebugVariableID ID) const { return IdxToVar[ID]; } + + void clear() { + VarToIdx.clear(); + IdxToVar.clear(); + } +}; + /// Handle-class for a particular "location". 
This value-type uniquely /// symbolises a register or stack location, allowing manipulation of locations /// without concern for where that location is. Practically, this allows us to @@ -985,7 +1023,7 @@ class MLocTracker { /// information in \pProperties, for variable Var. Don't insert it anywhere, /// just return the builder for it. MachineInstrBuilder emitLoc(const SmallVectorImpl &DbgOps, - const DebugVariable &Var, + const DebugVariable &Var, const DILocation *DILoc, const DbgValueProperties &Properties); }; @@ -1003,38 +1041,45 @@ using OverlapMap = /// identified. class VLocTracker { public: + /// Ref to function-wide map of DebugVariable <=> ID-numbers. + DebugVariableMap &DVMap; /// Map DebugVariable to the latest Value it's defined to have. /// Needs to be a MapVector because we determine order-in-the-input-MIR from - /// the order in this container. + /// the order in this container. (FIXME: likely no longer true as the ordering + /// is now provided by DebugVariableMap). /// We only retain the last DbgValue in each block for each variable, to /// determine the blocks live-out variable value. The Vars container forms the /// transfer function for this block, as part of the dataflow analysis. The /// movement of values between locations inside of a block is handled at a /// much later stage, in the TransferTracker class. 
- MapVector Vars; - SmallDenseMap Scopes; + MapVector Vars; + SmallDenseMap Scopes; MachineBasicBlock *MBB = nullptr; const OverlapMap &OverlappingFragments; DbgValueProperties EmptyProperties; public: - VLocTracker(const OverlapMap &O, const DIExpression *EmptyExpr) - : OverlappingFragments(O), EmptyProperties(EmptyExpr, false, false) {} + VLocTracker(DebugVariableMap &DVMap, const OverlapMap &O, + const DIExpression *EmptyExpr) + : DVMap(DVMap), OverlappingFragments(O), + EmptyProperties(EmptyExpr, false, false) {} void defVar(const MachineInstr &MI, const DbgValueProperties &Properties, const SmallVectorImpl &DebugOps) { assert(MI.isDebugValueLike()); DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); + // Either insert or fetch an ID number for this variable. + DebugVariableID VarID = DVMap.insertDVID(Var, MI.getDebugLoc().get()); DbgValue Rec = (DebugOps.size() > 0) ? DbgValue(DebugOps, Properties) : DbgValue(Properties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. - auto Result = Vars.insert(std::make_pair(Var, Rec)); + auto Result = Vars.insert(std::make_pair(VarID, Rec)); if (!Result.second) Result.first->second = Rec; - Scopes[Var] = MI.getDebugLoc().get(); + Scopes[VarID] = MI.getDebugLoc().get(); considerOverlaps(Var, MI.getDebugLoc().get()); } @@ -1056,13 +1101,15 @@ class VLocTracker { DebugVariable Overlapped(Var.getVariable(), OptFragmentInfo, Var.getInlinedAt()); + // Produce an ID number for this overlapping fragment of a variable. + DebugVariableID OverlappedID = DVMap.insertDVID(Overlapped, Loc); DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. 
- auto Result = Vars.insert(std::make_pair(Overlapped, Rec)); + auto Result = Vars.insert(std::make_pair(OverlappedID, Rec)); if (!Result.second) Result.first->second = Rec; - Scopes[Overlapped] = Loc; + Scopes[OverlappedID] = Loc; } } @@ -1093,7 +1140,7 @@ class InstrRefBasedLDV : public LDVImpl { /// variables to their values. using LiveIdxT = DenseMap; - using VarAndLoc = std::pair; + using VarAndLoc = std::pair; /// Type for a live-in value: the predecessor block, and its value. using InValueT = std::pair; @@ -1106,7 +1153,8 @@ class InstrRefBasedLDV : public LDVImpl { using ScopeToDILocT = DenseMap; /// Mapping from lexical scopes to variables in that scope. - using ScopeToVarsT = DenseMap>; + using ScopeToVarsT = + DenseMap>; /// Mapping from lexical scopes to blocks where variables in that scope are /// assigned. Such blocks aren't necessarily "in" the lexical scope, it's @@ -1200,6 +1248,11 @@ class InstrRefBasedLDV : public LDVImpl { DbgOpIDMap DbgOpStore; + /// Mapping between DebugVariables and unique ID numbers. This is a more + /// efficient way to represent the identity of a variable, versus a plain + /// DebugVariable. + DebugVariableMap DVMap; + /// True if we need to examine call instructions for stack clobbers. We /// normally assume that they don't clobber SP, but stack probes on Windows /// do. @@ -1330,9 +1383,9 @@ class InstrRefBasedLDV : public LDVImpl { /// performance as it doesn't have to find the dominance frontier between /// different assignments. void placePHIsForSingleVarDefinition( - const SmallPtrSetImpl &InScopeBlocks, - MachineBasicBlock *MBB, SmallVectorImpl &AllTheVLocs, - const DebugVariable &Var, LiveInsT &Output); + const SmallPtrSetImpl &InScopeBlocks, + MachineBasicBlock *MBB, SmallVectorImpl &AllTheVLocs, + DebugVariableID Var, LiveInsT &Output); /// Calculate the iterated-dominance-frontier for a set of defs, using the /// existing LLVM facilities for this. 
Works for a single "value" or @@ -1381,7 +1434,7 @@ class InstrRefBasedLDV : public LDVImpl { /// scope, but which do contain DBG_VALUEs, which VarLocBasedImpl tracks /// locations through. void buildVLocValueMap(const DILocation *DILoc, - const SmallSet &VarsWeCareAbout, + const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, @@ -1414,10 +1467,8 @@ class InstrRefBasedLDV : public LDVImpl { const SmallVectorImpl &BlockOrders); /// Take collections of DBG_VALUE instructions stored in TTracker, and - /// install them into their output blocks. Preserves a stable order of - /// DBG_VALUEs produced (which would otherwise cause nondeterminism) through - /// the AllVarsNumbering order. - bool emitTransfers(DenseMap &AllVarsNumbering); + /// install them into their output blocks. + bool emitTransfers(); /// Boilerplate computation of some initial sets, artifical blocks and /// RPOT block ordering. @@ -1437,13 +1488,14 @@ class InstrRefBasedLDV : public LDVImpl { /// block information can be fully computed before exploration finishes, /// allowing us to emit it and free data structures earlier than otherwise. /// It's also good for locality. 
- bool depthFirstVLocAndEmit( - unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation, - const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToBlocks, - LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, - SmallVectorImpl &AllTheVLocs, MachineFunction &MF, - DenseMap &AllVarsNumbering, - const TargetPassConfig &TPC); + bool depthFirstVLocAndEmit(unsigned MaxNumBlocks, + const ScopeToDILocT &ScopeToDILocation, + const ScopeToVarsT &ScopeToVars, + ScopeToAssignBlocksT &ScopeToBlocks, + LiveInsT &Output, FuncValueTable &MOutLocs, + FuncValueTable &MInLocs, + SmallVectorImpl &AllTheVLocs, + MachineFunction &MF, const TargetPassConfig &TPC); bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree, TargetPassConfig *TPC, unsigned InputBBLimit, diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir index b54c748ac9e84..67bfd85dcb379 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir @@ -17,12 +17,12 @@ # CHECK-LABEL: bb.3.bb3: # CHECK: DBG_VALUE $ecx, $noreg, !{{[0-9]+}}, # CHECK-SAME: !DIExpression(DW_OP_LLVM_fragment, 0, 32) -# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, +# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, # CHECK-SAME: !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 0, 32) # CHECK-SAME: $ecx, $r8d # CHECK-NEXT: DBG_VALUE $ebx, $noreg, !{{[0-9]+}}, # CHECK-SAME: !DIExpression(DW_OP_LLVM_fragment, 32, 32) -# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, +# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, # CHECK-SAME: !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 32, 32) # CHECK-SAME: $ebx, $r10d # CHECK-NEXT: XOR32rr diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 306a97c3149cc..50a8cb97ae061 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -55,6 +55,7 @@ 
class InstrRefLDVTest : public testing::Test { DIBasicType *LongInt; DIExpression *EmptyExpr; LiveDebugValues::OverlapMap Overlaps; + LiveDebugValues::DebugVariableMap DVMap; DebugLoc OutermostLoc, InBlockLoc, NotNestedBlockLoc, InlinedLoc; @@ -176,7 +177,7 @@ class InstrRefLDVTest : public testing::Test { void addVTracker() { ASSERT_TRUE(LDV); - VTracker = std::make_unique(Overlaps, EmptyExpr); + VTracker = std::make_unique(DVMap, Overlaps, EmptyExpr); LDV->VTracker = &*VTracker; } From 50b657c8f655a86826e94131729b0f13a58acbca Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 18 Jul 2024 15:05:57 +0100 Subject: [PATCH 021/486] Revert "[DebugInfo][InstrRef] Index DebugVariables and some DILocations (#99318)" This reverts commit 078198f310d55925ccd9e1aa5b6ff4af3b36bbc7. Buildbots unhappy, I must have fluffed it --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 201 +++++++++--------- .../LiveDebugValues/InstrRefBasedImpl.h | 106 +++------ .../MIR/X86/live-debug-values-fragments.mir | 4 +- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 3 +- 4 files changed, 128 insertions(+), 186 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index b9cf36a07846c..247258a1ff553 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -183,7 +183,6 @@ class TransferTracker { /// information from it. (XXX make it const?) MLocTracker *MTracker; MachineFunction &MF; - const DebugVariableMap &DVMap; bool ShouldEmitDebugEntryValues; /// Record of all changes in variable locations at a block position. Awkwardly @@ -192,9 +191,7 @@ class TransferTracker { struct Transfer { MachineBasicBlock::instr_iterator Pos; /// Position to insert DBG_VALUes MachineBasicBlock *MBB; /// non-null if we should insert after. - /// Vector of DBG_VALUEs to insert. 
Store with their DebugVariableID so that - /// they can be sorted into a stable order for emission at a later time. - SmallVector, 4> Insts; + SmallVector Insts; /// Vector of DBG_VALUEs to insert. }; /// Stores the resolved operands (machine locations and constants) and @@ -230,15 +227,15 @@ class TransferTracker { /// Map from LocIdxes to which DebugVariables are based that location. /// Mantained while stepping through the block. Not accurate if /// VarLocs[Idx] != MTracker->LocIdxToIDNum[Idx]. - DenseMap> ActiveMLocs; + DenseMap> ActiveMLocs; /// Map from DebugVariable to it's current location and qualifying meta /// information. To be used in conjunction with ActiveMLocs to construct /// enough information for the DBG_VALUEs for a particular LocIdx. - DenseMap ActiveVLocs; + DenseMap ActiveVLocs; /// Temporary cache of DBG_VALUEs to be entered into the Transfers collection. - SmallVector, 4> PendingDbgValues; + SmallVector PendingDbgValues; /// Record of a use-before-def: created when a value that's live-in to the /// current block isn't available in any machine location, but it will be @@ -247,12 +244,12 @@ class TransferTracker { /// Value of this variable, def'd in block. SmallVector Values; /// Identity of this variable. - DebugVariableID VarID; + DebugVariable Var; /// Additional variable properties. DbgValueProperties Properties; - UseBeforeDef(ArrayRef Values, DebugVariableID VarID, + UseBeforeDef(ArrayRef Values, const DebugVariable &Var, const DbgValueProperties &Properties) - : Values(Values.begin(), Values.end()), VarID(VarID), + : Values(Values.begin(), Values.end()), Var(Var), Properties(Properties) {} }; @@ -263,16 +260,15 @@ class TransferTracker { /// The set of variables that are in UseBeforeDefs and can become a location /// once the relevant value is defined. An element being erased from this /// collection prevents the use-before-def materializing. 
- DenseSet UseBeforeDefVariables; + DenseSet UseBeforeDefVariables; const TargetRegisterInfo &TRI; const BitVector &CalleeSavedRegs; TransferTracker(const TargetInstrInfo *TII, MLocTracker *MTracker, - MachineFunction &MF, const DebugVariableMap &DVMap, - const TargetRegisterInfo &TRI, + MachineFunction &MF, const TargetRegisterInfo &TRI, const BitVector &CalleeSavedRegs, const TargetPassConfig &TPC) - : TII(TII), MTracker(MTracker), MF(MF), DVMap(DVMap), TRI(TRI), + : TII(TII), MTracker(MTracker), MF(MF), TRI(TRI), CalleeSavedRegs(CalleeSavedRegs) { TLI = MF.getSubtarget().getTargetLowering(); auto &TM = TPC.getTM(); @@ -356,7 +352,7 @@ class TransferTracker { /// determine the values used by Value. void loadVarInloc(MachineBasicBlock &MBB, DbgOpIDMap &DbgOpStore, const SmallVectorImpl &ValueToLoc, - DebugVariableID VarID, DbgValue Value) { + DebugVariable Var, DbgValue Value) { SmallVector DbgOps; SmallVector ResolvedDbgOps; bool IsValueValid = true; @@ -405,7 +401,7 @@ class TransferTracker { static_cast(Num.getInst())); continue; } - recoverAsEntryValue(VarID, Value.Properties, Num); + recoverAsEntryValue(Var, Value.Properties, Num); IsValueValid = false; break; } @@ -423,7 +419,8 @@ class TransferTracker { // Add UseBeforeDef entry for the last value to be defined in this block. if (LastUseBeforeDef) { - addUseBeforeDef(VarID, Value.Properties, DbgOps, LastUseBeforeDef); + addUseBeforeDef(Var, Value.Properties, DbgOps, + LastUseBeforeDef); return; } @@ -431,15 +428,13 @@ class TransferTracker { // the transfer. 
for (const ResolvedDbgOp &Op : ResolvedDbgOps) if (!Op.IsConst) - ActiveMLocs[Op.Loc].insert(VarID); + ActiveMLocs[Op.Loc].insert(Var); auto NewValue = ResolvedDbgValue{ResolvedDbgOps, Value.Properties}; - auto Result = ActiveVLocs.insert(std::make_pair(VarID, NewValue)); + auto Result = ActiveVLocs.insert(std::make_pair(Var, NewValue)); if (!Result.second) Result.first->second = NewValue; - auto &[Var, DILoc] = DVMap.lookupDVID(VarID); PendingDbgValues.push_back( - std::make_pair(VarID, &*MTracker->emitLoc(ResolvedDbgOps, Var, DILoc, - Value.Properties))); + MTracker->emitLoc(ResolvedDbgOps, Var, Value.Properties)); } /// Load object with live-in variable values. \p mlocs contains the live-in @@ -450,7 +445,7 @@ class TransferTracker { /// FIXME: could just examine mloctracker instead of passing in \p mlocs? void loadInlocs(MachineBasicBlock &MBB, ValueTable &MLocs, DbgOpIDMap &DbgOpStore, - const SmallVectorImpl> &VLocs, + const SmallVectorImpl> &VLocs, unsigned NumLocs) { ActiveMLocs.clear(); ActiveVLocs.clear(); @@ -511,11 +506,11 @@ class TransferTracker { /// Record that \p Var has value \p ID, a value that becomes available /// later in the function. - void addUseBeforeDef(DebugVariableID VarID, + void addUseBeforeDef(const DebugVariable &Var, const DbgValueProperties &Properties, const SmallVectorImpl &DbgOps, unsigned Inst) { - UseBeforeDefs[Inst].emplace_back(DbgOps, VarID, Properties); - UseBeforeDefVariables.insert(VarID); + UseBeforeDefs[Inst].emplace_back(DbgOps, Var, Properties); + UseBeforeDefVariables.insert(Var); } /// After the instruction at index \p Inst and position \p pos has been @@ -534,7 +529,7 @@ class TransferTracker { // Populate ValueToLoc with illegal default mappings for every value used by // any UseBeforeDef variables for this instruction. 
for (auto &Use : MIt->second) { - if (!UseBeforeDefVariables.count(Use.VarID)) + if (!UseBeforeDefVariables.count(Use.Var)) continue; for (DbgOp &Op : Use.Values) { @@ -573,7 +568,7 @@ class TransferTracker { // Using the map of values to locations, produce a final set of values for // this variable. for (auto &Use : MIt->second) { - if (!UseBeforeDefVariables.count(Use.VarID)) + if (!UseBeforeDefVariables.count(Use.Var)) continue; SmallVector DbgOps; @@ -596,9 +591,8 @@ class TransferTracker { continue; // Otherwise, we're good to go. - auto &[Var, DILoc] = DVMap.lookupDVID(Use.VarID); - PendingDbgValues.push_back(std::make_pair( - Use.VarID, MTracker->emitLoc(DbgOps, Var, DILoc, Use.Properties))); + PendingDbgValues.push_back( + MTracker->emitLoc(DbgOps, Use.Var, Use.Properties)); } flushDbgValues(pos, nullptr); } @@ -648,7 +642,7 @@ class TransferTracker { return Reg != SP && Reg != FP; } - bool recoverAsEntryValue(DebugVariableID VarID, + bool recoverAsEntryValue(const DebugVariable &Var, const DbgValueProperties &Prop, const ValueIDNum &Num) { // Is this variable location a candidate to be an entry value. First, @@ -669,8 +663,6 @@ class TransferTracker { DIExpr = *NonVariadicExpression; } - auto &[Var, DILoc] = DVMap.lookupDVID(VarID); - // Is the variable appropriate for entry values (i.e., is a parameter). 
if (!isEntryValueVariable(Var, DIExpr)) return false; @@ -684,8 +676,9 @@ class TransferTracker { DIExpression::prepend(DIExpr, DIExpression::EntryValue); Register Reg = MTracker->LocIdxToLocID[Num.getLoc()]; MachineOperand MO = MachineOperand::CreateReg(Reg, false); - PendingDbgValues.push_back(std::make_pair( - VarID, &*emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false}))); + + PendingDbgValues.push_back( + emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false})); return true; } @@ -694,20 +687,19 @@ class TransferTracker { DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); DbgValueProperties Properties(MI); - DebugVariableID VarID = DVMap.getDVID(Var); // Ignore non-register locations, we don't transfer those. if (MI.isUndefDebugValue() || all_of(MI.debug_operands(), [](const MachineOperand &MO) { return !MO.isReg(); })) { - auto It = ActiveVLocs.find(VarID); + auto It = ActiveVLocs.find(Var); if (It != ActiveVLocs.end()) { for (LocIdx Loc : It->second.loc_indices()) - ActiveMLocs[Loc].erase(VarID); + ActiveMLocs[Loc].erase(Var); ActiveVLocs.erase(It); } // Any use-before-defs no longer apply. - UseBeforeDefVariables.erase(VarID); + UseBeforeDefVariables.erase(Var); return; } @@ -733,15 +725,14 @@ class TransferTracker { SmallVectorImpl &NewLocs) { DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); - DebugVariableID VarID = DVMap.getDVID(Var); // Any use-before-defs no longer apply. - UseBeforeDefVariables.erase(VarID); + UseBeforeDefVariables.erase(Var); // Erase any previous location. - auto It = ActiveVLocs.find(VarID); + auto It = ActiveVLocs.find(Var); if (It != ActiveVLocs.end()) { for (LocIdx Loc : It->second.loc_indices()) - ActiveMLocs[Loc].erase(VarID); + ActiveMLocs[Loc].erase(Var); } // If there _is_ no new location, all we had to do was erase. 
@@ -751,7 +742,7 @@ class TransferTracker { return; } - SmallVector> LostMLocs; + SmallVector> LostMLocs; for (ResolvedDbgOp &Op : NewLocs) { if (Op.IsConst) continue; @@ -778,17 +769,17 @@ class TransferTracker { for (const auto &LostMLoc : LostMLocs) ActiveMLocs[LostMLoc.first].erase(LostMLoc.second); LostMLocs.clear(); - It = ActiveVLocs.find(VarID); + It = ActiveVLocs.find(Var); ActiveMLocs[NewLoc.asU64()].clear(); VarLocs[NewLoc.asU64()] = MTracker->readMLoc(NewLoc); } - ActiveMLocs[NewLoc].insert(VarID); + ActiveMLocs[NewLoc].insert(Var); } if (It == ActiveVLocs.end()) { ActiveVLocs.insert( - std::make_pair(VarID, ResolvedDbgValue(NewLocs, Properties))); + std::make_pair(Var, ResolvedDbgValue(NewLocs, Properties))); } else { It->second.Ops.assign(NewLocs); It->second.Properties = Properties; @@ -831,21 +822,21 @@ class TransferTracker { // explicitly undef, then stop here. if (!NewLoc && !MakeUndef) { // Try and recover a few more locations with entry values. - for (DebugVariableID VarID : ActiveMLocIt->second) { - auto &Prop = ActiveVLocs.find(VarID)->second.Properties; - recoverAsEntryValue(VarID, Prop, OldValue); + for (const auto &Var : ActiveMLocIt->second) { + auto &Prop = ActiveVLocs.find(Var)->second.Properties; + recoverAsEntryValue(Var, Prop, OldValue); } flushDbgValues(Pos, nullptr); return; } // Examine all the variables based on this location. - DenseSet NewMLocs; + DenseSet NewMLocs; // If no new location has been found, every variable that depends on this // MLoc is dead, so end their existing MLoc->Var mappings as well. - SmallVector> LostMLocs; - for (DebugVariableID VarID : ActiveMLocIt->second) { - auto ActiveVLocIt = ActiveVLocs.find(VarID); + SmallVector> LostMLocs; + for (const auto &Var : ActiveMLocIt->second) { + auto ActiveVLocIt = ActiveVLocs.find(Var); // Re-state the variable location: if there's no replacement then NewLoc // is std::nullopt and a $noreg DBG_VALUE will be created. 
Otherwise, a // DBG_VALUE identifying the alternative location will be emitted. @@ -864,21 +855,19 @@ class TransferTracker { replace_copy(ActiveVLocIt->second.Ops, DbgOps.begin(), OldOp, NewOp); } - auto &[Var, DILoc] = DVMap.lookupDVID(VarID); - PendingDbgValues.push_back(std::make_pair( - VarID, &*MTracker->emitLoc(DbgOps, Var, DILoc, Properties))); + PendingDbgValues.push_back(MTracker->emitLoc(DbgOps, Var, Properties)); // Update machine locations <=> variable locations maps. Defer updating // ActiveMLocs to avoid invalidating the ActiveMLocIt iterator. if (!NewLoc) { for (LocIdx Loc : ActiveVLocIt->second.loc_indices()) { if (Loc != MLoc) - LostMLocs.emplace_back(Loc, VarID); + LostMLocs.emplace_back(Loc, Var); } ActiveVLocs.erase(ActiveVLocIt); } else { ActiveVLocIt->second.Ops = DbgOps; - NewMLocs.insert(VarID); + NewMLocs.insert(Var); } } @@ -902,8 +891,8 @@ class TransferTracker { // Commit ActiveMLoc changes. ActiveMLocIt->second.clear(); if (!NewMLocs.empty()) - for (DebugVariableID VarID : NewMLocs) - ActiveMLocs[*NewLoc].insert(VarID); + for (auto &Var : NewMLocs) + ActiveMLocs[*NewLoc].insert(Var); } /// Transfer variables based on \p Src to be based on \p Dst. This handles @@ -926,18 +915,17 @@ class TransferTracker { // For each variable based on Src; create a location at Dst. ResolvedDbgOp SrcOp(Src); ResolvedDbgOp DstOp(Dst); - for (DebugVariableID VarID : MovingVars) { - auto ActiveVLocIt = ActiveVLocs.find(VarID); + for (const auto &Var : MovingVars) { + auto ActiveVLocIt = ActiveVLocs.find(Var); assert(ActiveVLocIt != ActiveVLocs.end()); // Update all instances of Src in the variable's tracked values to Dst. 
std::replace(ActiveVLocIt->second.Ops.begin(), ActiveVLocIt->second.Ops.end(), SrcOp, DstOp); - auto &[Var, DILoc] = DVMap.lookupDVID(VarID); - MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var, DILoc, + MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var, ActiveVLocIt->second.Properties); - PendingDbgValues.push_back(std::make_pair(VarID, MI)); + PendingDbgValues.push_back(MI); } ActiveMLocs[Src].clear(); flushDbgValues(Pos, nullptr); @@ -1188,9 +1176,11 @@ LLVM_DUMP_METHOD void MLocTracker::dump_mloc_map() { MachineInstrBuilder MLocTracker::emitLoc(const SmallVectorImpl &DbgOps, - const DebugVariable &Var, const DILocation *DILoc, + const DebugVariable &Var, const DbgValueProperties &Properties) { - DebugLoc DL = DebugLoc(DILoc); + DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0, + Var.getVariable()->getScope(), + const_cast(Var.getInlinedAt())); const MCInstrDesc &Desc = Properties.IsVariadic ? TII.get(TargetOpcode::DBG_VALUE_LIST) @@ -1736,8 +1726,7 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, LastUseBeforeDef = std::max(LastUseBeforeDef, NewID.getInst()); } if (IsValidUseBeforeDef) { - DebugVariableID VID = DVMap.insertDVID(V, MI.getDebugLoc().get()); - TTracker->addUseBeforeDef(VID, {MI.getDebugExpression(), false, true}, + TTracker->addUseBeforeDef(V, {MI.getDebugExpression(), false, true}, DbgOps, LastUseBeforeDef); } } @@ -1746,11 +1735,9 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, // This DBG_VALUE is potentially a $noreg / undefined location, if // FoundLoc is illegal. // (XXX -- could morph the DBG_INSTR_REF in the future). 
- MachineInstr *DbgMI = - MTracker->emitLoc(NewLocs, V, MI.getDebugLoc().get(), Properties); - DebugVariableID ID = DVMap.getDVID(V); + MachineInstr *DbgMI = MTracker->emitLoc(NewLocs, V, Properties); - TTracker->PendingDbgValues.push_back(std::make_pair(ID, DbgMI)); + TTracker->PendingDbgValues.push_back(DbgMI); TTracker->flushDbgValues(MI.getIterator(), nullptr); return true; } @@ -3125,8 +3112,7 @@ void InstrRefBasedLDV::getBlocksForScope( } void InstrRefBasedLDV::buildVLocValueMap( - const DILocation *DILoc, - const SmallSet &VarsWeCareAbout, + const DILocation *DILoc, const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, SmallVectorImpl &AllTheVLocs) { @@ -3202,7 +3188,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // between blocks. This keeps the locality of working on one lexical scope at // at time, but avoids re-processing variable values because some other // variable has been assigned. - for (DebugVariableID VarID : VarsWeCareAbout) { + for (const auto &Var : VarsWeCareAbout) { // Re-initialize live-ins and live-outs, to clear the remains of previous // variables live-ins / live-outs. for (unsigned int I = 0; I < NumBlocks; ++I) { @@ -3216,7 +3202,7 @@ void InstrRefBasedLDV::buildVLocValueMap( SmallPtrSet DefBlocks; for (const MachineBasicBlock *ExpMBB : BlocksToExplore) { auto &TransferFunc = AllTheVLocs[ExpMBB->getNumber()].Vars; - if (TransferFunc.contains(VarID)) + if (TransferFunc.contains(Var)) DefBlocks.insert(const_cast(ExpMBB)); } @@ -3226,7 +3212,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // only one value definition, things are very simple. if (DefBlocks.size() == 1) { placePHIsForSingleVarDefinition(MutBlocksToExplore, *DefBlocks.begin(), - AllTheVLocs, VarID, Output); + AllTheVLocs, Var, Output); continue; } @@ -3299,7 +3285,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // Do transfer function. 
auto &VTracker = AllTheVLocs[MBB->getNumber()]; - auto TransferIt = VTracker.Vars.find(VarID); + auto TransferIt = VTracker.Vars.find(Var); if (TransferIt != VTracker.Vars.end()) { // Erase on empty transfer (DBG_VALUE $noreg). if (TransferIt->second.Kind == DbgValue::Undef) { @@ -3361,11 +3347,9 @@ void InstrRefBasedLDV::buildVLocValueMap( continue; if (BlockLiveIn->Kind == DbgValue::VPHI) BlockLiveIn->Kind = DbgValue::Def; - auto &[Var, DILoc] = DVMap.lookupDVID(VarID); assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() == - Var.getFragment() && - "Fragment info missing during value prop"); - Output[MBB->getNumber()].push_back(std::make_pair(VarID, *BlockLiveIn)); + Var.getFragment() && "Fragment info missing during value prop"); + Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn)); } } // Per-variable loop. @@ -3376,7 +3360,7 @@ void InstrRefBasedLDV::buildVLocValueMap( void InstrRefBasedLDV::placePHIsForSingleVarDefinition( const SmallPtrSetImpl &InScopeBlocks, MachineBasicBlock *AssignMBB, SmallVectorImpl &AllTheVLocs, - DebugVariableID VarID, LiveInsT &Output) { + const DebugVariable &Var, LiveInsT &Output) { // If there is a single definition of the variable, then working out it's // value everywhere is very simple: it's every block dominated by the // definition. At the dominance frontier, the usual algorithm would: @@ -3389,7 +3373,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition( // Pick out the variables value from the block transfer function. 
VLocTracker &VLocs = AllTheVLocs[AssignMBB->getNumber()]; - auto ValueIt = VLocs.Vars.find(VarID); + auto ValueIt = VLocs.Vars.find(Var); const DbgValue &Value = ValueIt->second; // If it's an explicit assignment of "undef", that means there is no location @@ -3404,7 +3388,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition( if (!DomTree->properlyDominates(AssignMBB, ScopeBlock)) continue; - Output[ScopeBlock->getNumber()].push_back({VarID, Value}); + Output[ScopeBlock->getNumber()].push_back({Var, Value}); } // All blocks that aren't dominated have no live-in value, thus no variable @@ -3531,9 +3515,9 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToAssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, SmallVectorImpl &AllTheVLocs, MachineFunction &MF, + DenseMap &AllVarsNumbering, const TargetPassConfig &TPC) { - TTracker = - new TransferTracker(TII, MTracker, MF, DVMap, *TRI, CalleeSavedRegs, TPC); + TTracker = new TransferTracker(TII, MTracker, MF, *TRI, CalleeSavedRegs, TPC); unsigned NumLocs = MTracker->getNumLocs(); VTracker = nullptr; @@ -3638,24 +3622,31 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( if (MInLocs.hasTableFor(*MBB)) EjectBlock(*MBB); - return emitTransfers(); + return emitTransfers(AllVarsNumbering); } -bool InstrRefBasedLDV::emitTransfers() { +bool InstrRefBasedLDV::emitTransfers( + DenseMap &AllVarsNumbering) { // Go through all the transfers recorded in the TransferTracker -- this is // both the live-ins to a block, and any movements of values that happen // in the middle. - for (auto &P : TTracker->Transfers) { + for (const auto &P : TTracker->Transfers) { // We have to insert DBG_VALUEs in a consistent order, otherwise they // appear in DWARF in different orders. Use the order that they appear // when walking through each block / each instruction, stored in - // DVMap. - llvm::sort(P.Insts, llvm::less_first()); + // AllVarsNumbering. 
+ SmallVector> Insts; + for (MachineInstr *MI : P.Insts) { + DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(), + MI->getDebugLoc()->getInlinedAt()); + Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI); + } + llvm::sort(Insts, llvm::less_first()); // Insert either before or after the designated point... if (P.MBB) { MachineBasicBlock &MBB = *P.MBB; - for (const auto &Pair : P.Insts) + for (const auto &Pair : Insts) MBB.insert(P.Pos, Pair.second); } else { // Terminators, like tail calls, can clobber things. Don't try and place @@ -3664,7 +3655,7 @@ bool InstrRefBasedLDV::emitTransfers() { continue; MachineBasicBlock &MBB = *P.Pos->getParent(); - for (const auto &Pair : P.Insts) + for (const auto &Pair : Insts) MBB.insertAfterBundle(P.Pos, Pair.second); } } @@ -3719,7 +3710,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, initialSetup(MF); MLocTransfer.resize(MaxNumBlocks); - vlocs.resize(MaxNumBlocks, VLocTracker(DVMap, OverlapFragments, EmptyExpr)); + vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr)); SavedLiveIns.resize(MaxNumBlocks); produceMLocTransferFunction(MF, MLocTransfer, MaxNumBlocks); @@ -3775,6 +3766,10 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, MTracker->reset(); } + // Number all variables in the order that they appear, to be used as a stable + // insertion order later. + DenseMap AllVarsNumbering; + // Map from one LexicalScope to all the variables in that scope. ScopeToVarsT ScopeToVars; @@ -3793,15 +3788,16 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, auto *VTracker = &vlocs[MBB->getNumber()]; // Collect each variable with a DBG_VALUE in this block. 
for (auto &idx : VTracker->Vars) { - DebugVariableID VarID = idx.first; - const DILocation *ScopeLoc = VTracker->Scopes[VarID]; + const auto &Var = idx.first; + const DILocation *ScopeLoc = VTracker->Scopes[Var]; assert(ScopeLoc != nullptr); auto *Scope = LS.findLexicalScope(ScopeLoc); // No insts in scope -> shouldn't have been recorded. assert(Scope != nullptr); - ScopeToVars[Scope].insert(VarID); + AllVarsNumbering.insert(std::make_pair(Var, AllVarsNumbering.size())); + ScopeToVars[Scope].insert(Var); ScopeToAssignBlocks[Scope].insert(VTracker->MBB); ScopeToDILocation[Scope] = ScopeLoc; ++VarAssignCount; @@ -3825,7 +3821,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // the "else" block of this condition. Changed = depthFirstVLocAndEmit( MaxNumBlocks, ScopeToDILocation, ScopeToVars, ScopeToAssignBlocks, - SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, *TPC); + SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, AllVarsNumbering, *TPC); } delete MTracker; @@ -3844,7 +3840,6 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, SeenFragments.clear(); SeenDbgPHIs.clear(); DbgOpStore.clear(); - DVMap.clear(); return Changed; } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index 8c03e38eee062..8770983481c2f 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -35,44 +35,6 @@ class DbgOpIDMap; using namespace llvm; -using DebugVariableID = unsigned; -using VarAndLoc = std::pair; - -/// Mapping from DebugVariable to/from a unique identifying number. Each -/// DebugVariable consists of three pointers, and after a small amount of -/// work to identify overlapping fragments of variables we mostly only use -/// DebugVariables as identities of variables. It's much more compile-time -/// efficient to use an ID number instead, which this class provides. 
-class DebugVariableMap { - DenseMap VarToIdx; - SmallVector IdxToVar; - -public: - DebugVariableID getDVID(const DebugVariable &Var) const { - auto It = VarToIdx.find(Var); - assert(It != VarToIdx.end()); - return It->second; - } - - DebugVariableID insertDVID(DebugVariable &Var, const DILocation *Loc) { - unsigned Size = VarToIdx.size(); - auto ItPair = VarToIdx.insert({Var, Size}); - if (ItPair.second) { - IdxToVar.push_back({Var, Loc}); - return Size; - } - - return ItPair.first->second; - } - - const VarAndLoc &lookupDVID(DebugVariableID ID) const { return IdxToVar[ID]; } - - void clear() { - VarToIdx.clear(); - IdxToVar.clear(); - } -}; - /// Handle-class for a particular "location". This value-type uniquely /// symbolises a register or stack location, allowing manipulation of locations /// without concern for where that location is. Practically, this allows us to @@ -1023,7 +985,7 @@ class MLocTracker { /// information in \pProperties, for variable Var. Don't insert it anywhere, /// just return the builder for it. MachineInstrBuilder emitLoc(const SmallVectorImpl &DbgOps, - const DebugVariable &Var, const DILocation *DILoc, + const DebugVariable &Var, const DbgValueProperties &Properties); }; @@ -1041,45 +1003,38 @@ using OverlapMap = /// identified. class VLocTracker { public: - /// Ref to function-wide map of DebugVariable <=> ID-numbers. - DebugVariableMap &DVMap; /// Map DebugVariable to the latest Value it's defined to have. /// Needs to be a MapVector because we determine order-in-the-input-MIR from - /// the order in this container. (FIXME: likely no longer true as the ordering - /// is now provided by DebugVariableMap). + /// the order in this container. /// We only retain the last DbgValue in each block for each variable, to /// determine the blocks live-out variable value. The Vars container forms the /// transfer function for this block, as part of the dataflow analysis. 
The /// movement of values between locations inside of a block is handled at a /// much later stage, in the TransferTracker class. - MapVector Vars; - SmallDenseMap Scopes; + MapVector Vars; + SmallDenseMap Scopes; MachineBasicBlock *MBB = nullptr; const OverlapMap &OverlappingFragments; DbgValueProperties EmptyProperties; public: - VLocTracker(DebugVariableMap &DVMap, const OverlapMap &O, - const DIExpression *EmptyExpr) - : DVMap(DVMap), OverlappingFragments(O), - EmptyProperties(EmptyExpr, false, false) {} + VLocTracker(const OverlapMap &O, const DIExpression *EmptyExpr) + : OverlappingFragments(O), EmptyProperties(EmptyExpr, false, false) {} void defVar(const MachineInstr &MI, const DbgValueProperties &Properties, const SmallVectorImpl &DebugOps) { assert(MI.isDebugValueLike()); DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); - // Either insert or fetch an ID number for this variable. - DebugVariableID VarID = DVMap.insertDVID(Var, MI.getDebugLoc().get()); DbgValue Rec = (DebugOps.size() > 0) ? DbgValue(DebugOps, Properties) : DbgValue(Properties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. - auto Result = Vars.insert(std::make_pair(VarID, Rec)); + auto Result = Vars.insert(std::make_pair(Var, Rec)); if (!Result.second) Result.first->second = Rec; - Scopes[VarID] = MI.getDebugLoc().get(); + Scopes[Var] = MI.getDebugLoc().get(); considerOverlaps(Var, MI.getDebugLoc().get()); } @@ -1101,15 +1056,13 @@ class VLocTracker { DebugVariable Overlapped(Var.getVariable(), OptFragmentInfo, Var.getInlinedAt()); - // Produce an ID number for this overlapping fragment of a variable. - DebugVariableID OverlappedID = DVMap.insertDVID(Overlapped, Loc); DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. 
- auto Result = Vars.insert(std::make_pair(OverlappedID, Rec)); + auto Result = Vars.insert(std::make_pair(Overlapped, Rec)); if (!Result.second) Result.first->second = Rec; - Scopes[OverlappedID] = Loc; + Scopes[Overlapped] = Loc; } } @@ -1140,7 +1093,7 @@ class InstrRefBasedLDV : public LDVImpl { /// variables to their values. using LiveIdxT = DenseMap; - using VarAndLoc = std::pair; + using VarAndLoc = std::pair; /// Type for a live-in value: the predecessor block, and its value. using InValueT = std::pair; @@ -1153,8 +1106,7 @@ class InstrRefBasedLDV : public LDVImpl { using ScopeToDILocT = DenseMap; /// Mapping from lexical scopes to variables in that scope. - using ScopeToVarsT = - DenseMap>; + using ScopeToVarsT = DenseMap>; /// Mapping from lexical scopes to blocks where variables in that scope are /// assigned. Such blocks aren't necessarily "in" the lexical scope, it's @@ -1248,11 +1200,6 @@ class InstrRefBasedLDV : public LDVImpl { DbgOpIDMap DbgOpStore; - /// Mapping between DebugVariables and unique ID numbers. This is a more - /// efficient way to represent the identity of a variable, versus a plain - /// DebugVariable. - DebugVariableMap DVMap; - /// True if we need to examine call instructions for stack clobbers. We /// normally assume that they don't clobber SP, but stack probes on Windows /// do. @@ -1383,9 +1330,9 @@ class InstrRefBasedLDV : public LDVImpl { /// performance as it doesn't have to find the dominance frontier between /// different assignments. void placePHIsForSingleVarDefinition( - const SmallPtrSetImpl &InScopeBlocks, - MachineBasicBlock *MBB, SmallVectorImpl &AllTheVLocs, - DebugVariableID Var, LiveInsT &Output); + const SmallPtrSetImpl &InScopeBlocks, + MachineBasicBlock *MBB, SmallVectorImpl &AllTheVLocs, + const DebugVariable &Var, LiveInsT &Output); /// Calculate the iterated-dominance-frontier for a set of defs, using the /// existing LLVM facilities for this. 
Works for a single "value" or @@ -1434,7 +1381,7 @@ class InstrRefBasedLDV : public LDVImpl { /// scope, but which do contain DBG_VALUEs, which VarLocBasedImpl tracks /// locations through. void buildVLocValueMap(const DILocation *DILoc, - const SmallSet &VarsWeCareAbout, + const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, @@ -1467,8 +1414,10 @@ class InstrRefBasedLDV : public LDVImpl { const SmallVectorImpl &BlockOrders); /// Take collections of DBG_VALUE instructions stored in TTracker, and - /// install them into their output blocks. - bool emitTransfers(); + /// install them into their output blocks. Preserves a stable order of + /// DBG_VALUEs produced (which would otherwise cause nondeterminism) through + /// the AllVarsNumbering order. + bool emitTransfers(DenseMap &AllVarsNumbering); /// Boilerplate computation of some initial sets, artifical blocks and /// RPOT block ordering. @@ -1488,14 +1437,13 @@ class InstrRefBasedLDV : public LDVImpl { /// block information can be fully computed before exploration finishes, /// allowing us to emit it and free data structures earlier than otherwise. /// It's also good for locality. 
- bool depthFirstVLocAndEmit(unsigned MaxNumBlocks, - const ScopeToDILocT &ScopeToDILocation, - const ScopeToVarsT &ScopeToVars, - ScopeToAssignBlocksT &ScopeToBlocks, - LiveInsT &Output, FuncValueTable &MOutLocs, - FuncValueTable &MInLocs, - SmallVectorImpl &AllTheVLocs, - MachineFunction &MF, const TargetPassConfig &TPC); + bool depthFirstVLocAndEmit( + unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation, + const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToBlocks, + LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, + SmallVectorImpl &AllTheVLocs, MachineFunction &MF, + DenseMap &AllVarsNumbering, + const TargetPassConfig &TPC); bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree, TargetPassConfig *TPC, unsigned InputBBLimit, diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir index 67bfd85dcb379..b54c748ac9e84 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir @@ -17,12 +17,12 @@ # CHECK-LABEL: bb.3.bb3: # CHECK: DBG_VALUE $ecx, $noreg, !{{[0-9]+}}, # CHECK-SAME: !DIExpression(DW_OP_LLVM_fragment, 0, 32) -# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, +# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, # CHECK-SAME: !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 0, 32) # CHECK-SAME: $ecx, $r8d # CHECK-NEXT: DBG_VALUE $ebx, $noreg, !{{[0-9]+}}, # CHECK-SAME: !DIExpression(DW_OP_LLVM_fragment, 32, 32) -# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, +# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, # CHECK-SAME: !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 32, 32) # CHECK-SAME: $ebx, $r10d # CHECK-NEXT: XOR32rr diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 50a8cb97ae061..306a97c3149cc 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -55,7 +55,6 @@ 
class InstrRefLDVTest : public testing::Test { DIBasicType *LongInt; DIExpression *EmptyExpr; LiveDebugValues::OverlapMap Overlaps; - LiveDebugValues::DebugVariableMap DVMap; DebugLoc OutermostLoc, InBlockLoc, NotNestedBlockLoc, InlinedLoc; @@ -177,7 +176,7 @@ class InstrRefLDVTest : public testing::Test { void addVTracker() { ASSERT_TRUE(LDV); - VTracker = std::make_unique(DVMap, Overlaps, EmptyExpr); + VTracker = std::make_unique(Overlaps, EmptyExpr); LDV->VTracker = &*VTracker; } From 1a80153ba91f1e623c042fa0ae1ee5ab67087c0e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 18 Jul 2024 10:11:39 -0400 Subject: [PATCH 022/486] [LV][NFC]Simplify the structure and improve message of safe distance analysis for scalable vectorization. (#99487) --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 40919c944d21f..7ca798a8b2d89 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4197,14 +4197,11 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { return false; } - if (!Legal->isSafeForAnyVectorWidth()) { - std::optional MaxVScale = getMaxVScale(*TheFunction, TTI); - if (!MaxVScale) { - reportVectorizationInfo( - "The target does not provide maximum vscale value.", - "ScalableVFUnfeasible", ORE, TheLoop); - return false; - } + if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) { + reportVectorizationInfo("The target does not provide maximum vscale value " + "for safe distance analysis.", + "ScalableVFUnfeasible", ORE, TheLoop); + return false; } IsScalableVectorizationAllowed = true; From 92f9f014015554c5dd18df4699765cc42853a04d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Jul 2024 15:09:40 +0100 Subject: [PATCH 023/486] [X86] getGFNICtrlMask - create a vXi8 mask 
instead of a bitcasted vXi64 mask. Helps avoid some missed load-folds by stripping away bitcasts and make it easier to grok the GF2P8AFFINEQB masks. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +- llvm/test/CodeGen/X86/bitreverse.ll | 2 +- llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 141 +++++++-------- llvm/test/CodeGen/X86/gfni-rotates.ll | 188 +++++++++----------- llvm/test/CodeGen/X86/gfni-shifts.ll | 127 ++++++------- llvm/test/CodeGen/X86/vector-bitreverse.ll | 71 +++----- 6 files changed, 236 insertions(+), 301 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 56d08e7f76908..9d742be43408f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29102,8 +29102,12 @@ SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT assert(VT.getVectorElementType() == MVT::i8 && (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type"); uint64_t Imm = getGFNICtrlImm(Opcode, Amt); - MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); - return DAG.getBitcast(VT, DAG.getConstant(Imm, DL, MaskVT)); + SmallVector MaskBits; + for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) { + uint64_t Bits = (Imm >> (I % 64)) & 255; + MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8)); + } + return DAG.getBuildVector(VT, DL, MaskBits); } // Return true if the required (according to Opcode) shift-imm form is natively diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 4f2654843728f..e256b811ee839 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -1340,7 +1340,7 @@ define i528 @large_promotion(i528 %A) nounwind { ; GFNI-NEXT: pushq %r14 ; GFNI-NEXT: pushq %rbx ; GFNI-NEXT: movq %rdi, %rax -; GFNI-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9241421688590303745,9241421688590303745] +; GFNI-NEXT: vpbroadcastq {{.*#+}} xmm0 = 
[1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; GFNI-NEXT: vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1 ; GFNI-NEXT: vmovq %xmm1, %r10 diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 5857ff1162ceb..c071f64dc66cd 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -699,7 +699,7 @@ define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 @@ -743,7 +743,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm4, %xmm6 ; GFNISSE-NEXT: movdqa %xmm0, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: movdqa %xmm2, %xmm9 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm9 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] @@ -751,12 +751,12 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE-NEXT: pand %xmm7, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = 
[290499906672525312,290499906672525312] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNISSE-NEXT: movdqa %xmm2, %xmm10 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [145249953336295424,145249953336295424] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNISSE-NEXT: movdqa %xmm2, %xmm11 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 @@ -769,7 +769,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNISSE-NEXT: psllw $5, %xmm6 ; GFNISSE-NEXT: movdqa %xmm6, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: movdqa %xmm4, %xmm13 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm13 ; GFNISSE-NEXT: paddb %xmm6, %xmm6 @@ -819,7 +819,7 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX1-LABEL: var_fshr_v32i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm4, %xmm6 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] @@ -827,25 +827,25 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm8 ; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [290499906672525312,290499906672525312] +; GFNIAVX1-NEXT: vmovddup 
{{.*#+}} xmm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX1-NEXT: # xmm6 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm4, %xmm9 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 ; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm4, %xmm4 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [145249953336295424,145249953336295424] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: # xmm9 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm4, %xmm10 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 ; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm10, %xmm4, %xmm4 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm8 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm10 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm10 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm10 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm8, %xmm11 ; GFNIAVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm11, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm11 = [1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm11 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm11 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm11, %xmm8, %xmm12 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 @@ -880,33 +880,28 @@ define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nou ; ; GFNIAVX2-LABEL: var_fshr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3 -; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm5 -; GFNIAVX2-NEXT: vpsllw $5, %ymm5, %ymm5 -; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: 
vpbroadcastq {{.*#+}} ymm3 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3 -; GFNIAVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm3 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4 +; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm6 +; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 +; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 ; GFNIAVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 -; GFNIAVX2-NEXT: vpandn %ymm4, %ymm2, %ymm2 +; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 -; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 -; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 +; GFNIAVX2-NEXT: 
vpaddb %ymm2, %ymm2, %ymm3 +; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 +; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; @@ -1288,7 +1283,7 @@ define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE-LABEL: splatconstant_fshl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm5 = [16909320,16909320] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 @@ -1307,10 +1302,8 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; ; GFNIAVX2-LABEL: splatconstant_fshl_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16909320,16909320,16909320,16909320] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; @@ -1328,9 +1321,9 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, 
<32 x i8> %b) nounwind { ; GFNISSE-LABEL: splatconstant_fshr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 ; GFNISSE-NEXT: por %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 @@ -1347,10 +1340,8 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; ; GFNIAVX2-LABEL: splatconstant_fshr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1108169199648,1108169199648,1108169199648,1108169199648] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; @@ -1686,9 +1677,9 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-LABEL: var_fshl_v64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; 
GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm6 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpandq %zmm7, %zmm2, %zmm2 @@ -1696,7 +1687,7 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vpxor %ymm7, %ymm8, %ymm9 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9 ; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm3, %ymm10 ; GFNIAVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm3, %ymm3 @@ -1716,11 +1707,11 @@ define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm8, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; 
GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm7 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm3, %ymm3 @@ -1805,19 +1796,19 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1 ; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 ; GFNISSE-NEXT: movdqa %xmm6, %xmm8 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm8 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 ; GFNISSE-NEXT: pand %xmm12, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 ; GFNISSE-NEXT: movdqa %xmm6, %xmm8 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [290499906672525312,290499906672525312] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm8 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 ; GFNISSE-NEXT: movdqa %xmm6, %xmm8 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm14 = [145249953336295424,145249953336295424] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm14 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm14, %xmm8 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6 @@ -1830,7 +1821,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2 ; GFNISSE-NEXT: movdqa %xmm2, %xmm8 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm0 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm0 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm0, %xmm8 ; GFNISSE-NEXT: paddb %xmm9, %xmm9 ; GFNISSE-NEXT: movdqa %xmm9, %xmm0 @@ -1917,7 +1908,7 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> 
%amt) nou ; GFNIAVX1-LABEL: var_fshr_v64i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm7 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm8, %xmm9 ; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] @@ -1925,25 +1916,25 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX1-NEXT: vextractf128 $1, %ymm11, %xmm10 ; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm12 ; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm9, %xmm8, %xmm8 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [290499906672525312,290499906672525312] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm8, %xmm9 ; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12 ; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm9, %xmm8, %xmm9 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [145249953336295424,145249953336295424] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: # xmm8 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm9, %xmm13 ; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12 ; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm13, %xmm9, %xmm12 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 ; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm13 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm9 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm13, %xmm14 ; GFNIAVX1-NEXT: vpxor %xmm6, %xmm10, %xmm10 ; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm15 ; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm10 = 
[1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm10 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm10 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm10, %xmm13, %xmm14 ; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15 @@ -2025,27 +2016,27 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; ; GFNIAVX2-LABEL: var_fshr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm8 ; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX2-NEXT: vpand %ymm6, %ymm4, %ymm9 ; GFNIAVX2-NEXT: vpsllw $5, %ymm9, %ymm9 ; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm10 ; GFNIAVX2-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm10, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11 ; GFNIAVX2-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16909320,16909320,16909320,16909320] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = 
[8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm0, %ymm11 ; GFNIAVX2-NEXT: vpandn %ymm6, %ymm4, %ymm4 ; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm11, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm11 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm11 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm11, %ymm0, %ymm12 ; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 ; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0 @@ -2080,18 +2071,18 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-LABEL: var_fshr_v64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5 ; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; GFNIAVX512VL-NEXT: vpandq %zmm6, %zmm2, %zmm2 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm7 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm8 ; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm9 ; GFNIAVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 ; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm9, 
%ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm9 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm9, %ymm3, %ymm10 ; GFNIAVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8 ; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm10, %ymm3, %ymm3 @@ -2107,12 +2098,12 @@ define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nou ; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm5 ; GFNIAVX512VL-NEXT: vpxor %ymm6, %ymm7, %ymm7 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7 ; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm3, %ymm8 ; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 ; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm8, %ymm3, %ymm3 @@ -2727,7 +2718,7 @@ define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-LABEL: splatconstant_fshl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4 ; GFNISSE-NEXT: paddb 
%xmm0, %xmm0 ; GFNISSE-NEXT: por %xmm4, %xmm0 @@ -2761,7 +2752,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -2794,7 +2785,7 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>) define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-LABEL: splatconstant_fshr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [290499906672525312,290499906672525312] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm4 ; GFNISSE-NEXT: pmovsxwq {{.*#+}} xmm9 = [258,258] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 @@ -2824,9 +2815,9 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [258,258,258,258] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm3, %ymm2 diff --git 
a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index cc077410228cb..5fd4dfa7cc262 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -415,7 +415,7 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm2, %xmm4 ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm6 = [16909320,16909320] ; GFNISSE-NEXT: movdqa %xmm2, %xmm7 @@ -424,17 +424,17 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNISSE-NEXT: psllw $5, %xmm4 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [4647714815446351872,4647714815446351872] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNISSE-NEXT: movdqa %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: movdqa %xmm2, %xmm9 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm9 ; GFNISSE-NEXT: por %xmm0, %xmm9 ; GFNISSE-NEXT: paddb %xmm4, %xmm4 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNISSE-NEXT: movdqa %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 ; GFNISSE-NEXT: movdqa %xmm2, %xmm10 @@ -473,26 +473,26 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNIAVX1-LABEL: var_rotl_v32i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm3 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6 ; GFNIAVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; GFNIAVX1-NEXT: vpsllw $5, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm7 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm8 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm9 ; GFNIAVX1-NEXT: vpor %xmm7, %xmm9, %xmm7 ; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm7, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX1-NEXT: # xmm7 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm2, %xmm9 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm10 @@ -519,22 +519,17 @@ define <32 x i8> @var_rotl_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; ; GFNIAVX2-LABEL: var_rotl_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 -; 
GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -581,7 +576,7 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNISSE-LABEL: var_rotr_v32i8: ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm5 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm0 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm7 = [16909320,16909320] ; GFNISSE-NEXT: movdqa %xmm5, %xmm8 @@ -592,16 
+587,16 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNISSE-NEXT: psubb %xmm2, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm5 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [4647714815446351872,4647714815446351872] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNISSE-NEXT: movdqa %xmm5, %xmm9 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm9 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: movdqa %xmm5, %xmm10 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm8, %xmm10 ; GFNISSE-NEXT: por %xmm9, %xmm10 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm5 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNISSE-NEXT: movdqa %xmm5, %xmm10 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10 ; GFNISSE-NEXT: movdqa %xmm5, %xmm11 @@ -640,10 +635,10 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNIAVX1-LABEL: var_rotr_v32i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm3 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm2, %xmm6 ; GFNIAVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 @@ -652,16 +647,16 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; GFNIAVX1-NEXT: vpsubb %xmm6, %xmm7, %xmm6 ; GFNIAVX1-NEXT: vpsllw $5, %xmm6, %xmm6 ; 
GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm8 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm9 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm2, %xmm10 ; GFNIAVX1-NEXT: vpor %xmm8, %xmm10, %xmm8 ; GFNIAVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 ; GFNIAVX1-NEXT: vpblendvb %xmm6, %xmm8, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX1-NEXT: # xmm8 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm2, %xmm10 ; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm11 @@ -689,24 +684,19 @@ define <32 x i8> @var_rotr_v32i8(<32 x i8> %a, <32 x i8> %amt) nounwind { ; ; GFNIAVX2-LABEL: var_rotr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm3, %ymm1 ; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872] 
-; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1108169199648,1108169199648,1108169199648,1108169199648] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm3 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm3 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -1075,21 +1065,15 @@ define <32 x i8> @constant_rotr_v32i8(<32 x i8> %a) nounwind { define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotl_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1161999622378488840,1161999622378488840] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; -; GFNIAVX1-LABEL: splatconstant_rotl_v32i8: -; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX1-NEXT: retq -; -; GFNIAVX2-LABEL: splatconstant_rotl_v32i8: -; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1161999622378488840,1161999622378488840,1161999622378488840,1161999622378488840] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_rotl_v32i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotl_v32i8: ; GFNIAVX512: # %bb.0: @@ -1103,21 +1087,15 @@ declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [4647715923615551520,4647715923615551520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; -; GFNIAVX1-LABEL: splatconstant_rotr_v32i8: -; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX1-NEXT: retq -; -; GFNIAVX2-LABEL: splatconstant_rotr_v32i8: -; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4647715923615551520,4647715923615551520,4647715923615551520,4647715923615551520] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_rotr_v32i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_rotr_v32i8: ; GFNIAVX512: # %bb.0: @@ -1137,7 +1115,7 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm4, %xmm8 ; GFNISSE-NEXT: movdqa %xmm0, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm0 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm10 = [16909320,16909320] ; GFNISSE-NEXT: movdqa %xmm4, %xmm11 @@ -1146,17 +1124,17 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNISSE-NEXT: psllw $5, %xmm8 ; GFNISSE-NEXT: 
movdqa %xmm8, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [4647714815446351872,4647714815446351872] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: movdqa %xmm4, %xmm13 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm13 ; GFNISSE-NEXT: por %xmm0, %xmm13 ; GFNISSE-NEXT: paddb %xmm8, %xmm8 ; GFNISSE-NEXT: movdqa %xmm8, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm0 ; GFNISSE-NEXT: movdqa %xmm4, %xmm14 @@ -1243,26 +1221,26 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX1-LABEL: var_rotl_v64i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm6, %xmm7 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8 ; GFNIAVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 ; GFNIAVX1-NEXT: vpsllw $5, %xmm8, %xmm8 ; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm7, %xmm6, %xmm9 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [4647714815446351872,4647714815446351872] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = 
[0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNIAVX1-NEXT: # xmm6 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm9, %xmm10 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm7 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm9, %xmm11 ; GFNIAVX1-NEXT: vpor %xmm10, %xmm11, %xmm10 ; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm11 ; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm10, %xmm9, %xmm9 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [9223372036854775808,9223372036854775808] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX1-NEXT: # xmm8 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm9, %xmm10 ; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm12 @@ -1322,21 +1300,21 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; ; GFNIAVX2-LABEL: var_rotl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16909320,16909320,16909320,16909320] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7 ; GFNIAVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 ; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, 
%ymm7 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm9 ; GFNIAVX2-NEXT: vpor %ymm7, %ymm9, %ymm7 ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm7, %ymm0, %ymm9 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm10 ; GFNIAVX2-NEXT: vpor %ymm9, %ymm10, %ymm9 @@ -1362,22 +1340,22 @@ define <64 x i8> @var_rotl_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512VL-LABEL: var_rotl_v64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16909320,16909320,16909320,16909320] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6 ; GFNIAVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872] +; GFNIAVX512VL-NEXT: vpbroadcastq 
{{.*#+}} ymm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9 ; GFNIAVX512VL-NEXT: vpor %ymm7, %ymm9, %ymm7 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9 ; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm10 ; GFNIAVX512VL-NEXT: vpor %ymm9, %ymm10, %ymm9 @@ -1422,7 +1400,7 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNISSE-LABEL: var_rotr_v64i8: ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm0, %xmm9 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm0 ; GFNISSE-NEXT: pmovsxdq {{.*#+}} xmm11 = [16909320,16909320] ; GFNISSE-NEXT: movdqa %xmm9, %xmm12 @@ -1433,16 +1411,16 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNISSE-NEXT: psubb %xmm4, %xmm0 ; GFNISSE-NEXT: psllw $5, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm9 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [4647714815446351872,4647714815446351872] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNISSE-NEXT: movdqa %xmm9, %xmm13 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm13 -; GFNISSE-NEXT: movdqa 
{{.*#+}} xmm12 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: movdqa %xmm9, %xmm14 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm12, %xmm14 ; GFNISSE-NEXT: por %xmm13, %xmm14 ; GFNISSE-NEXT: paddb %xmm0, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm9 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNISSE-NEXT: movdqa %xmm9, %xmm14 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm13, %xmm14 ; GFNISSE-NEXT: movdqa %xmm9, %xmm15 @@ -1527,10 +1505,10 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX1-LABEL: var_rotr_v64i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm7, %xmm6 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm7, %xmm8 ; GFNIAVX1-NEXT: vpor %xmm6, %xmm8, %xmm8 @@ -1539,16 +1517,16 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX1-NEXT: vpsubb %xmm9, %xmm6, %xmm9 ; GFNIAVX1-NEXT: vpsllw $5, %xmm9, %xmm9 ; GFNIAVX1-NEXT: vpblendvb %xmm9, %xmm8, %xmm7, %xmm10 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [4647714815446351872,4647714815446351872] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNIAVX1-NEXT: # xmm7 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm7, %xmm10, %xmm11 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = [1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm8 = 
[32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm8 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm8, %xmm10, %xmm12 ; GFNIAVX1-NEXT: vpor %xmm11, %xmm12, %xmm11 ; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm12 ; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm11, %xmm10, %xmm10 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [9223372036854775808,9223372036854775808] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX1-NEXT: # xmm9 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm9, %xmm10, %xmm11 ; GFNIAVX1-NEXT: vpaddb %xmm10, %xmm10, %xmm13 @@ -1611,23 +1589,23 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; ; GFNIAVX2-LABEL: var_rotr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16909320,16909320,16909320,16909320] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7 ; GFNIAVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 ; GFNIAVX2-NEXT: vpxor %xmm7, %xmm7, %xmm7 ; GFNIAVX2-NEXT: vpsubb %ymm2, %ymm7, %ymm2 ; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [4647714815446351872,4647714815446351872,4647714815446351872,4647714815446351872] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64,0,0,0,0,0,0,128,64] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm8 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = 
[32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm9, %ymm0, %ymm10 ; GFNIAVX2-NEXT: vpor %ymm8, %ymm10, %ymm8 ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm8, %ymm0, %ymm10 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm11 ; GFNIAVX2-NEXT: vpor %ymm10, %ymm11, %ymm10 @@ -1654,24 +1632,24 @@ define <64 x i8> @var_rotr_v64i8(<64 x i8> %a, <64 x i8> %amt) nounwind { ; GFNIAVX512VL-LABEL: var_rotr_v64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm5, %ymm2, %ymm6 ; GFNIAVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm6, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [258,258,258,258] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm7 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = 
[290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm8, %ymm2, %ymm9 ; GFNIAVX512VL-NEXT: vpor %ymm7, %ymm9, %ymm7 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 ; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm7, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,1,1,1] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm7, %ymm2, %ymm9 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm10, %ymm2, %ymm11 ; GFNIAVX512VL-NEXT: vpor %ymm9, %ymm11, %ymm9 ; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6 @@ -2257,7 +2235,7 @@ define <64 x i8> @constant_rotr_v64i8(<64 x i8> %a) nounwind { define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotl_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9223655728169885760,9223655728169885760] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 @@ -2266,14 +2244,14 @@ define <64 x i8> @splatconstant_rotl_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX1-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760] +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = 
[64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223655728169885760,9223655728169885760,9223655728169885760,9223655728169885760] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq @@ -2290,7 +2268,7 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>) define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_rotr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [290499906672525570,290499906672525570] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 @@ -2299,14 +2277,14 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX1-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570] +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [290499906672525570,290499906672525570,290499906672525570,290499906672525570] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = 
[2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index fc57b84ab9f02..6ed524e406826 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -615,7 +615,7 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE-NEXT: psllw $5, %xmm4 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: movdqa %xmm2, %xmm7 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7 ; GFNISSE-NEXT: paddb %xmm4, %xmm4 @@ -647,13 +647,13 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX1-LABEL: var_shl_v32i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm3 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 @@ -675,12 +675,10 @@ define <32 x i8> @var_shl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; ; GFNIAVX2-LABEL: var_shl_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16909320,16909320,16909320,16909320] -; 
GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1108169199648,1108169199648,1108169199648,1108169199648] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 @@ -717,19 +715,19 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm2, %xmm4 ; GFNISSE-NEXT: movdqa %xmm0, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: movdqa %xmm0, %xmm6 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm6 ; GFNISSE-NEXT: psllw $5, %xmm4 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [290499906672525312,290499906672525312] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNISSE-NEXT: movdqa %xmm2, %xmm7 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm6, %xmm7 ; GFNISSE-NEXT: paddb %xmm4, %xmm4 ; GFNISSE-NEXT: movdqa %xmm4, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm7, %xmm2 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [145249953336295424,145249953336295424] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNISSE-NEXT: movdqa %xmm2, %xmm8 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm7, %xmm8 ; GFNISSE-NEXT: paddb %xmm4, %xmm4 @@ -756,18 +754,18 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; GFNIAVX1-LABEL: var_lshr_v32i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm3 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm3, %xmm2, %xmm4 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [290499906672525312,290499906672525312] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm2, %xmm6 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 ; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm6, %xmm2, %xmm2 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [145249953336295424,145249953336295424] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: # xmm6 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm2, %xmm7 ; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 @@ -786,16 +784,13 @@ define <32 x i8> @var_lshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; ; GFNIAVX2-LABEL: var_lshr_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = 
[145249953336295424,145249953336295424,145249953336295424,145249953336295424] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm2 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -1459,16 +1454,10 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; -; GFNIAVX1-LABEL: splatconstant_shl_v32i8: -; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX1-NEXT: retq -; -; GFNIAVX2-LABEL: splatconstant_shl_v32i8: -; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [258,258,258,258] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_shl_v32i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_shl_v32i8: ; GFNIAVX512: # %bb.0: @@ -1481,21 +1470,15 @@ define <32 x i8> @splatconstant_shl_v32i8(<32 x i8> %a) nounwind { define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_lshr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [145249953336295424,145249953336295424] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; -; GFNIAVX1-LABEL: splatconstant_lshr_v32i8: -; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX1-NEXT: retq -; -; GFNIAVX2-LABEL: splatconstant_lshr_v32i8: -; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[145249953336295424,145249953336295424,145249953336295424,145249953336295424] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_lshr_v32i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_lshr_v32i8: ; GFNIAVX512: # %bb.0: @@ -1508,21 +1491,15 @@ define <32 x i8> @splatconstant_lshr_v32i8(<32 x i8> %a) nounwind { define <32 x i8> @splatconstant_ashr_v32i8(<32 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_ashr_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [290499906672558208,290499906672558208] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,64,32,16,8,4,128,128,128,64,32,16,8,4] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; -; GFNIAVX1-LABEL: splatconstant_ashr_v32i8: -; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX1-NEXT: retq -; -; GFNIAVX2-LABEL: splatconstant_ashr_v32i8: -; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [290499906672558208,290499906672558208,290499906672558208,290499906672558208] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: retq +; GFNIAVX1OR2-LABEL: splatconstant_ashr_v32i8: +; GFNIAVX1OR2: # %bb.0: +; GFNIAVX1OR2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1OR2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_ashr_v32i8: ; GFNIAVX512: # %bb.0: @@ -1547,7 +1524,7 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE-NEXT: psllw $5, %xmm8 ; GFNISSE-NEXT: movdqa %xmm8, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [1108169199648,1108169199648] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNISSE-NEXT: 
movdqa %xmm4, %xmm11 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11 ; GFNISSE-NEXT: paddb %xmm8, %xmm8 @@ -1609,13 +1586,13 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-LABEL: var_shl_v64i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [16909320,16909320] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm5, %xmm6 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1108169199648,1108169199648] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 @@ -1658,11 +1635,11 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX2-LABEL: var_shl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16909320,16909320,16909320,16909320] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5 ; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6 ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0 @@ -1683,12 +1660,12 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX512VL-LABEL: var_shl_v64i8: ; GFNIAVX512VL: # %bb.0: 
; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [16909320,16909320,16909320,16909320] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0,8,4,2,1,0,0,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1108169199648,1108169199648,1108169199648,1108169199648] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0,32,16,8,4,2,1,0,0] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 @@ -1728,19 +1705,19 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa %xmm4, %xmm8 ; GFNISSE-NEXT: movdqa %xmm0, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [1161999622361579520,1161999622361579520] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNISSE-NEXT: movdqa %xmm0, %xmm10 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm9, %xmm10 ; GFNISSE-NEXT: psllw $5, %xmm8 ; GFNISSE-NEXT: movdqa %xmm8, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [290499906672525312,290499906672525312] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNISSE-NEXT: movdqa %xmm4, %xmm11 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm10, %xmm11 ; GFNISSE-NEXT: paddb %xmm8, %xmm8 ; GFNISSE-NEXT: movdqa %xmm8, %xmm0 ; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm4 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [145249953336295424,145249953336295424] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNISSE-NEXT: movdqa %xmm4, %xmm12 ; 
GFNISSE-NEXT: gf2p8affineqb $0, %xmm11, %xmm12 ; GFNISSE-NEXT: paddb %xmm8, %xmm8 @@ -1797,18 +1774,18 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX1-LABEL: var_lshr_v64i8: ; GFNIAVX1: # %bb.0: ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1161999622361579520,1161999622361579520] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX1-NEXT: # xmm4 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm4, %xmm5, %xmm6 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 ; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm6, %xmm5, %xmm6 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [290499906672525312,290499906672525312] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX1-NEXT: # xmm5 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm5, %xmm6, %xmm8 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 ; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm8, %xmm6, %xmm8 -; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [145249953336295424,145249953336295424] +; GFNIAVX1-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX1-NEXT: # xmm6 = mem[0,0] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %xmm6, %xmm8, %xmm9 ; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7 @@ -1848,15 +1825,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX2-LABEL: var_lshr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm4, %ymm0, %ymm5 ; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = 
[290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm5, %ymm0, %ymm6 ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm6, %ymm0, %ymm7 ; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0 @@ -1874,16 +1851,16 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; GFNIAVX512VL-LABEL: var_lshr_v64i8: ; GFNIAVX512VL: # %bb.0: ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1161999622361579520,1161999622361579520,1161999622361579520,1161999622361579520] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm3, %ymm2, %ymm4 ; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; GFNIAVX512VL-NEXT: vpsllw $5, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [290499906672525312,290499906672525312,290499906672525312,290499906672525312] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4,0,0,128,64,32,16,8,4] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm4, %ymm2, %ymm6 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm2 -; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = 
[145249953336295424,145249953336295424,145249953336295424,145249953336295424] +; GFNIAVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2,0,128,64,32,16,8,4,2] ; GFNIAVX512VL-NEXT: vgf2p8affineqb $0, %ymm6, %ymm2, %ymm7 ; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2 @@ -2999,7 +2976,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX2-LABEL: splatconstant_shl_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [66052,66052,66052,66052] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0,4,2,1,0,0,0,0,0] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq @@ -3015,7 +2992,7 @@ define <64 x i8> @splatconstant_shl_v64i8(<64 x i8> %a) nounwind { define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_lshr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 @@ -3031,7 +3008,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq @@ -3047,7 +3024,7 @@ define <64 x i8> @splatconstant_lshr_v64i8(<64 x i8> %a) 
nounwind { define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind { ; GFNISSE-LABEL: splatconstant_ashr_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [145249953336295552,145249953336295552] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 @@ -3063,7 +3040,7 @@ define <64 x i8> @splatconstant_ashr_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [145249953336295552,145249953336295552,145249953336295552,145249953336295552] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2,128,128,64,32,16,8,4,2] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index c54a7f4642253..5dcf19013f0b7 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -875,27 +875,15 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { ; ; GFNISSE-LABEL: test_bitreverse_v32i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 ; GFNISSE-NEXT: retq ; -; GFNIAVX1-LABEL: test_bitreverse_v32i8: -; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; GFNIAVX1-NEXT: retq -; -; GFNIAVX2-LABEL: test_bitreverse_v32i8: -; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: retq -; -; GFNIAVX512-LABEL: test_bitreverse_v32i8: -; GFNIAVX512: # %bb.0: -; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 -; GFNIAVX512-NEXT: retq +; GFNIAVX-LABEL: test_bitreverse_v32i8: +; GFNIAVX: # %bb.0: +; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX-NEXT: retq %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ret <32 x i8> %b } @@ -1058,7 +1046,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; GFNISSE-NEXT: pshufb %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 ; GFNISSE-NEXT: pshufb %xmm2, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 @@ -1071,21 +1059,20 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v16i16: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: test_bitreverse_v16i16: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ret <16 x i16> %b @@ -1258,7 +1245,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; GFNISSE-NEXT: pshufb %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 ; GFNISSE-NEXT: pshufb %xmm2, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 @@ -1271,21 +1258,20 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: retq ; ; GFNIAVX2-LABEL: test_bitreverse_v8i32: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: test_bitreverse_v8i32: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ret <8 x i32> %b @@ -1462,7 +1448,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; GFNISSE-NEXT: pshufb %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 ; GFNISSE-NEXT: pshufb %xmm2, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 @@ -1475,21 +1461,20 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 ; GFNIAVX1-NEXT: 
retq ; ; GFNIAVX2-LABEL: test_bitreverse_v4i64: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: test_bitreverse_v4i64: ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX512-NEXT: retq %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ret <4 x i64> %b @@ -1741,7 +1726,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; ; GFNISSE-LABEL: test_bitreverse_v64i8: ; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 @@ -1757,7 +1742,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; ; GFNIAVX2-LABEL: test_bitreverse_v64i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; 
GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: retq @@ -2054,7 +2039,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; GFNISSE-NEXT: pshufb %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 ; GFNISSE-NEXT: pshufb %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 @@ -2085,7 +2070,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 @@ -2412,7 +2397,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; GFNISSE-NEXT: pshufb %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 ; GFNISSE-NEXT: pshufb %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 @@ -2443,7 +2428,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = 
[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 @@ -2778,7 +2763,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; GFNISSE: # %bb.0: ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; GFNISSE-NEXT: pshufb %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745] +; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 ; GFNISSE-NEXT: pshufb %xmm4, %xmm1 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 @@ -2809,7 +2794,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1] ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1 From 9fae0c6f9c05915a5daac5b368258a40e1fab237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 18 Jul 2024 15:30:42 +0200 Subject: [PATCH 024/486] Reapply 
"[clang][Interp] Fix CheckCallable for undefined-and-not-constexpr fns" This reverts commit ad7aeb0ff58ebd29f68adb85c64e8010639e2a76. --- clang/lib/AST/Interp/Interp.cpp | 87 +++++++++++++++++---------------- clang/lib/AST/Interp/Interp.h | 6 +-- clang/test/AST/Interp/cxx2a.cpp | 15 ++++++ 3 files changed, 64 insertions(+), 44 deletions(-) create mode 100644 clang/test/AST/Interp/cxx2a.cpp diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 2be9b5360d055..e6e9298982887 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -579,57 +579,62 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { return false; } - if (!F->isConstexpr() || !F->hasBody()) { - const SourceLocation &Loc = S.Current->getLocation(OpPC); - if (S.getLangOpts().CPlusPlus11) { - const FunctionDecl *DiagDecl = F->getDecl(); + if (F->isConstexpr() && F->hasBody() && + (F->getDecl()->isConstexpr() || F->getDecl()->hasAttr())) + return true; - // Invalid decls have been diagnosed before. - if (DiagDecl->isInvalidDecl()) - return false; + // Implicitly constexpr. + if (F->isLambdaStaticInvoker()) + return true; - // If this function is not constexpr because it is an inherited - // non-constexpr constructor, diagnose that directly. - const auto *CD = dyn_cast(DiagDecl); - if (CD && CD->isInheritingConstructor()) { - const auto *Inherited = CD->getInheritedConstructor().getConstructor(); - if (!Inherited->isConstexpr()) - DiagDecl = CD = Inherited; - } + const SourceLocation &Loc = S.Current->getLocation(OpPC); + if (S.getLangOpts().CPlusPlus11) { + const FunctionDecl *DiagDecl = F->getDecl(); + + // Invalid decls have been diagnosed before. + if (DiagDecl->isInvalidDecl()) + return false; + + // If this function is not constexpr because it is an inherited + // non-constexpr constructor, diagnose that directly. 
+ const auto *CD = dyn_cast(DiagDecl); + if (CD && CD->isInheritingConstructor()) { + const auto *Inherited = CD->getInheritedConstructor().getConstructor(); + if (!Inherited->isConstexpr()) + DiagDecl = CD = Inherited; + } - // FIXME: If DiagDecl is an implicitly-declared special member function - // or an inheriting constructor, we should be much more explicit about why - // it's not constexpr. - if (CD && CD->isInheritingConstructor()) { - S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1) + // FIXME: If DiagDecl is an implicitly-declared special member function + // or an inheriting constructor, we should be much more explicit about why + // it's not constexpr. + if (CD && CD->isInheritingConstructor()) { + S.FFDiag(Loc, diag::note_constexpr_invalid_inhctor, 1) << CD->getInheritedConstructor().getConstructor()->getParent(); - S.Note(DiagDecl->getLocation(), diag::note_declared_at); - } else { - // Don't emit anything if the function isn't defined and we're checking - // for a constant expression. It might be defined at the point we're - // actually calling it. - bool IsExtern = DiagDecl->getStorageClass() == SC_Extern; - if (!DiagDecl->isDefined() && !IsExtern && - S.checkingPotentialConstantExpression()) - return false; + S.Note(DiagDecl->getLocation(), diag::note_declared_at); + } else { + // Don't emit anything if the function isn't defined and we're checking + // for a constant expression. It might be defined at the point we're + // actually calling it. + bool IsExtern = DiagDecl->getStorageClass() == SC_Extern; + if (!DiagDecl->isDefined() && !IsExtern && DiagDecl->isConstexpr() && + S.checkingPotentialConstantExpression()) + return false; - // If the declaration is defined, declared 'constexpr' _and_ has a body, - // the below diagnostic doesn't add anything useful. 
- if (DiagDecl->isDefined() && DiagDecl->isConstexpr() && - DiagDecl->hasBody()) - return false; + // If the declaration is defined, declared 'constexpr' _and_ has a body, + // the below diagnostic doesn't add anything useful. + if (DiagDecl->isDefined() && DiagDecl->isConstexpr() && + DiagDecl->hasBody()) + return false; - S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1) + S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1) << DiagDecl->isConstexpr() << (bool)CD << DiagDecl; - S.Note(DiagDecl->getLocation(), diag::note_declared_at); - } - } else { - S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr); + S.Note(DiagDecl->getLocation(), diag::note_declared_at); } - return false; + } else { + S.FFDiag(Loc, diag::note_invalid_subexpr_in_const_expr); } - return true; + return false; } bool CheckCallDepth(InterpState &S, CodePtr OpPC) { diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 17b3157cb40a9..2e159012f5ffd 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2531,14 +2531,14 @@ inline bool Call(InterpState &S, CodePtr OpPC, const Function *Func, if (!CheckInvoke(S, OpPC, ThisPtr)) return false; } - - if (S.checkingPotentialConstantExpression()) - return false; } if (!CheckCallable(S, OpPC, Func)) return false; + if (Func->hasThisPointer() && S.checkingPotentialConstantExpression()) + return false; + if (!CheckCallDepth(S, OpPC)) return false; diff --git a/clang/test/AST/Interp/cxx2a.cpp b/clang/test/AST/Interp/cxx2a.cpp new file mode 100644 index 0000000000000..27d1aa1a27f75 --- /dev/null +++ b/clang/test/AST/Interp/cxx2a.cpp @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=ref,both %s +// RUN: %clang_cc1 -std=c++2a -fsyntax-only -fcxx-exceptions -verify=expected,both %s -fexperimental-new-constant-interpreter + +template +struct S { + S() requires (N==1) = default; + S() requires (N==2) {} // both-note {{declared here}} + consteval S() requires 
(N==3) = default; +}; + +consteval int aConstevalFunction() { // both-error {{consteval function never produces a constant expression}} + S<2> s4; // both-note {{non-constexpr constructor 'S' cannot be used in a constant expression}} + return 0; +} +/// We're NOT calling the above function. The diagnostics should appear anyway. From cd495d2cdd84a22026a115c7e9923c27b196732e Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Thu, 18 Jul 2024 16:18:34 +0200 Subject: [PATCH 025/486] Add source file name for template instantiations in -ftime-trace (#98320) This is helpful in identifying file and location which contain the particular template declaration. --- a-abfdec1d.o.tmp | 0 clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/Driver/Options.td | 4 + .../include/clang/Frontend/FrontendOptions.h | 8 +- clang/lib/Driver/ToolChains/Clang.cpp | 1 + clang/lib/Sema/SemaTemplateInstantiate.cpp | 11 +- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 11 +- clang/test/Driver/ftime-trace-sections.cpp | 2 +- clang/test/Driver/ftime-trace.cpp | 39 +++--- clang/tools/driver/cc1_main.cpp | 3 +- clang/unittests/Support/TimeProfilerTest.cpp | 121 ++++++++++++++---- llvm/include/llvm/Support/TimeProfiler.h | 23 +++- llvm/lib/Support/TimeProfiler.cpp | 61 +++++++-- 13 files changed, 223 insertions(+), 64 deletions(-) create mode 100644 a-abfdec1d.o.tmp diff --git a/a-abfdec1d.o.tmp b/a-abfdec1d.o.tmp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e0e86af257a19..971df672b6ca1 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -736,6 +736,9 @@ Improvements to Clang's time-trace - Clang now specifies that using ``auto`` in a lambda parameter is a C++14 extension when appropriate. (`#46059: `_). +- Clang now adds source file infomation for template instantiations as ``event["args"]["filename"]``. This + added behind an option ``-ftime-trace-verbose``. 
This is expected to increase the size of trace by 2-3 times. + Improvements to Coverage Mapping -------------------------------- diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 1675e435d210c..d3068c1b30a7a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3988,6 +3988,10 @@ def ftime_trace_granularity_EQ : Joined<["-"], "ftime-trace-granularity=">, Grou HelpText<"Minimum time granularity (in microseconds) traced by time profiler">, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, MarshallingInfoInt, "500u">; +def ftime_trace_verbose : Joined<["-"], "ftime-trace-verbose">, Group, + HelpText<"Make time trace capture verbose event details (e.g. source filenames). This can increase the size of the output by 2-3 times">, + Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, + MarshallingInfoFlag>; def ftime_trace_EQ : Joined<["-"], "ftime-trace=">, Group, HelpText<"Similar to -ftime-trace. Specify the JSON file or a directory which will contain the JSON file">, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h index 5e5034fe01eb5..8241925c98476 100644 --- a/clang/include/clang/Frontend/FrontendOptions.h +++ b/clang/include/clang/Frontend/FrontendOptions.h @@ -580,6 +580,11 @@ class FrontendOptions { /// Minimum time granularity (in microseconds) traced by time profiler. unsigned TimeTraceGranularity; + /// Make time trace capture verbose event details (e.g. source filenames). + /// This can increase the size of the output by 2-3 times. 
+ LLVM_PREFERRED_TYPE(bool) + unsigned TimeTraceVerbose : 1; + /// Path which stores the output files for -ftime-trace std::string TimeTracePath; @@ -601,7 +606,8 @@ class FrontendOptions { EmitSymbolGraph(false), EmitExtensionSymbolGraphs(false), EmitSymbolGraphSymbolLabelsForTesting(false), EmitPrettySymbolGraphs(false), GenReducedBMI(false), - UseClangIRPipeline(false), TimeTraceGranularity(500) {} + UseClangIRPipeline(false), TimeTraceGranularity(500), + TimeTraceVerbose(false) {} /// getInputKindForExtension - Return the appropriate input kind for a file /// extension. For example, "c" would return Language::C. diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 1fd6fba210042..6b33301d36401 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6754,6 +6754,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (const char *Name = C.getTimeTraceFile(&JA)) { CmdArgs.push_back(Args.MakeArgString("-ftime-trace=" + Twine(Name))); Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_granularity_EQ); + Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_verbose); } if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) { diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index a7bc6749c5852..725b62db5e80a 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -3426,11 +3426,16 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation, return true; llvm::TimeTraceScope TimeScope("InstantiateClass", [&]() { - std::string Name; - llvm::raw_string_ostream OS(Name); + llvm::TimeTraceMetadata M; + llvm::raw_string_ostream OS(M.Detail); Instantiation->getNameForDiagnostic(OS, getPrintingPolicy(), /*Qualified=*/true); - return Name; + if (llvm::isTimeTraceVerbose()) { + auto Loc = SourceMgr.getExpansionLoc(Instantiation->getLocation()); + M.File = 
SourceMgr.getFilename(Loc); + M.Line = SourceMgr.getExpansionLineNumber(Loc); + } + return M; }); Pattern = PatternDef; diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 01432301633ed..4e619f4b491a6 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -4966,11 +4966,16 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, } llvm::TimeTraceScope TimeScope("InstantiateFunction", [&]() { - std::string Name; - llvm::raw_string_ostream OS(Name); + llvm::TimeTraceMetadata M; + llvm::raw_string_ostream OS(M.Detail); Function->getNameForDiagnostic(OS, getPrintingPolicy(), /*Qualified=*/true); - return Name; + if (llvm::isTimeTraceVerbose()) { + auto Loc = SourceMgr.getExpansionLoc(Function->getLocation()); + M.File = SourceMgr.getFilename(Loc); + M.Line = SourceMgr.getExpansionLineNumber(Loc); + } + return M; }); // If we're performing recursive template instantiation, create our own diff --git a/clang/test/Driver/ftime-trace-sections.cpp b/clang/test/Driver/ftime-trace-sections.cpp index 0c16052bc0c3a..da7109b9d81a6 100644 --- a/clang/test/Driver/ftime-trace-sections.cpp +++ b/clang/test/Driver/ftime-trace-sections.cpp @@ -1,5 +1,5 @@ // RUN: rm -rf %t && mkdir %t && cd %t -// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -o out %s +// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s // RUN: %python %S/ftime-trace-sections.py < out.json template diff --git a/clang/test/Driver/ftime-trace.cpp b/clang/test/Driver/ftime-trace.cpp index 5fe63de915a71..60c5885704b58 100644 --- a/clang/test/Driver/ftime-trace.cpp +++ b/clang/test/Driver/ftime-trace.cpp @@ -1,18 +1,18 @@ // RUN: rm -rf %t && mkdir -p %t && cd %t -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace 
-ftime-trace-granularity=0 -ftime-trace-verbose -o out %s // RUN: cat out.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s // RUN: cat new-name.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s // RUN: mkdir dir1 dir2 -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s // RUN: cat dir1/out.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s // RUN: cat dir2/out.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s @@ -34,32 +34,33 @@ // RUN: mkdir d e f && cp %s d/a.cpp && touch d/b.c /// TODO: Support -fno-integrated-as. 
-// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1 -// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0" +// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1 +// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" -// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2 -// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0" -// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0" +// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2 +// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" /// -o specifies the link output. Create ${output}-${basename}.json. -// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1 -// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0" -// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0" +// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1 +// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" /// -dumpdir is f/g, not ending with a path separator. We create f/g${basename}.json. 
-// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2 -// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0" -// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0" +// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2 +// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" -// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3 -// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0" -// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0" +// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3 +// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" -// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -xassembler d/a.cpp 2>&1 | \ +// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -ftime-trace-verbose -xassembler d/a.cpp 2>&1 | \ // RUN: FileCheck %s --check-prefix=UNUSED // UNUSED: warning: argument unused during compilation: '-ftime-trace' // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace=e' // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-granularity=1' +// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-verbose' // UNUSED-NOT: warning: template diff 
--git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index c2ccb47a15bc8..f5e5fad36573e 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -241,7 +241,8 @@ int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { if (!Clang->getFrontendOpts().TimeTracePath.empty()) { llvm::timeTraceProfilerInitialize( - Clang->getFrontendOpts().TimeTraceGranularity, Argv0); + Clang->getFrontendOpts().TimeTraceGranularity, Argv0, + Clang->getFrontendOpts().TimeTraceVerbose); } // --print-supported-cpus takes priority over the actual compilation. if (Clang->getFrontendOpts().PrintSupportedCPUs) diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp index 5f3950ff033f1..96e137508ed94 100644 --- a/clang/unittests/Support/TimeProfilerTest.cpp +++ b/clang/unittests/Support/TimeProfilerTest.cpp @@ -10,11 +10,14 @@ #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/PreprocessorOptions.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/JSON.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/VirtualFileSystem.h" #include #include "gtest/gtest.h" +#include using namespace clang; using namespace llvm; @@ -23,7 +26,8 @@ namespace { // Should be called before testing. void setupProfiler() { - timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test"); + timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test", + /*TimeTraceVerbose=*/true); } // Should be called after `compileFromString()`. @@ -38,14 +42,24 @@ std::string teardownProfiler() { // Returns true if code compiles successfully. // We only parse AST here. This is enough for constexpr evaluation. 
-bool compileFromString(StringRef Code, StringRef Standard, StringRef FileName) { +bool compileFromString(StringRef Code, StringRef Standard, StringRef File, + llvm::StringMap Headers = {}) { CompilerInstance Compiler; Compiler.createDiagnostics(); + llvm::IntrusiveRefCntPtr FS( + new llvm::vfs::InMemoryFileSystem()); + FS->addFile(File, 0, MemoryBuffer::getMemBuffer(Code)); + for (const auto &Header : Headers) { + FS->addFile(Header.getKey(), 0, + MemoryBuffer::getMemBuffer(Header.getValue())); + } + llvm::IntrusiveRefCntPtr Files( + new FileManager(FileSystemOptions(), FS)); + Compiler.setFileManager(Files.get()); + auto Invocation = std::make_shared(); - Invocation->getPreprocessorOpts().addRemappedFile( - FileName, MemoryBuffer::getMemBuffer(Code).release()); - const char *Args[] = {Standard.data(), FileName.data()}; + std::vector Args = {Standard.data(), File.data()}; CompilerInvocation::CreateFromArgs(*Invocation, Args, Compiler.getDiagnostics()); Compiler.setInvocation(std::move(Invocation)); @@ -60,13 +74,27 @@ bool compileFromString(StringRef Code, StringRef Standard, StringRef FileName) { return Compiler.ExecuteAction(Action); } +std::string GetMetadata(json::Object *Event) { + std::string Metadata; + llvm::raw_string_ostream OS(Metadata); + if (json::Object *Args = Event->getObject("args")) { + if (auto Detail = Args->getString("detail")) + OS << Detail->str(); + if (auto File = Args->getString("file")) + OS << ", " << File->str(); + if (auto Line = Args->getInteger("line")) + OS << ":" << *Line; + } + return Metadata; +} + // Returns pretty-printed trace graph. 
std::string buildTraceGraph(StringRef Json) { struct EventRecord { int64_t TimestampBegin; int64_t TimestampEnd; - StringRef Name; - StringRef Detail; + std::string Name; + std::string Metadata; }; std::vector Events; @@ -81,10 +109,13 @@ std::string buildTraceGraph(StringRef Json) { int64_t TimestampBegin = TraceEventObj->getInteger("ts").value_or(0); int64_t TimestampEnd = TimestampBegin + TraceEventObj->getInteger("dur").value_or(0); - StringRef Name = TraceEventObj->getString("name").value_or(""); - StringRef Detail = ""; - if (json::Object *Args = TraceEventObj->getObject("args")) - Detail = Args->getString("detail").value_or(""); + std::string Name = TraceEventObj->getString("name").value_or("").str(); + std::string Metadata = GetMetadata(TraceEventObj); + + // Source events are asynchronous events and may not perfectly nest the + // synchronous events. Skip testing them. + if (Name == "Source") + continue; // This is a "summary" event, like "Total PerformPendingInstantiations", // skip it @@ -92,7 +123,7 @@ std::string buildTraceGraph(StringRef Json) { continue; Events.emplace_back( - EventRecord{TimestampBegin, TimestampEnd, Name, Detail}); + EventRecord{TimestampBegin, TimestampEnd, Name, Metadata}); } // There can be nested events that are very fast, for example: @@ -132,9 +163,9 @@ std::string buildTraceGraph(StringRef Json) { Stream << "| "; } Stream.write(Event.Name.data(), Event.Name.size()); - if (!Event.Detail.empty()) { + if (!Event.Metadata.empty()) { Stream << " ("; - Stream.write(Event.Detail.data(), Event.Detail.size()); + Stream.write(Event.Metadata.data(), Event.Metadata.size()); Stream << ")"; } Stream << "\n"; @@ -145,7 +176,7 @@ std::string buildTraceGraph(StringRef Json) { } // namespace TEST(TimeProfilerTest, ConstantEvaluationCxx20) { - constexpr StringRef Code = R"( + std::string Code = R"( void print(double value); namespace slow_namespace { @@ -175,8 +206,7 @@ constexpr int slow_init_list[] = {1, 1, 2, 3, 5, 8, 13, 21}; // 25th line 
setupProfiler(); ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc")); std::string Json = teardownProfiler(); - std::string TraceGraph = buildTraceGraph(Json); - ASSERT_TRUE(TraceGraph == R"( + ASSERT_EQ(R"( Frontend | ParseDeclarationOrFunctionDefinition (test.cc:2:1) | ParseDeclarationOrFunctionDefinition (test.cc:6:1) @@ -202,14 +232,54 @@ Frontend | ParseDeclarationOrFunctionDefinition (test.cc:25:1) | | EvaluateAsInitializer (slow_init_list) | PerformPendingInstantiations -)"); +)", + buildTraceGraph(Json)); +} + +TEST(TimeProfilerTest, TemplateInstantiations) { + std::string B_H = R"( + template + T fooB(T t) { + return T(); + } - // NOTE: If this test is failing, run this test with - // `llvm::errs() << TraceGraph;` and change the assert above. + #define MacroTemp(x) template void foo##x(T) { T(); } + )"; + + std::string A_H = R"( + #include "b.h" + + MacroTemp(MTA) + + template + void fooA(T t) { fooB(t); fooMTA(t); } + )"; + std::string Code = R"( + #include "a.h" + void user() { fooA(0); } + )"; + + setupProfiler(); + ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc", + /*Headers=*/{{"a.h", A_H}, {"b.h", B_H}})); + std::string Json = teardownProfiler(); + ASSERT_EQ(R"( +Frontend +| ParseFunctionDefinition (fooB) +| ParseFunctionDefinition (fooMTA) +| ParseFunctionDefinition (fooA) +| ParseDeclarationOrFunctionDefinition (test.cc:3:5) +| | ParseFunctionDefinition (user) +| PerformPendingInstantiations +| | InstantiateFunction (fooA, ./a.h:7) +| | | InstantiateFunction (fooB, ./b.h:3) +| | | InstantiateFunction (fooMTA, ./a.h:4) +)", + buildTraceGraph(Json)); } TEST(TimeProfilerTest, ConstantEvaluationC99) { - constexpr StringRef Code = R"( + std::string Code = R"( struct { short quantval[4]; // 3rd line } value; @@ -218,15 +288,12 @@ struct { setupProfiler(); ASSERT_TRUE(compileFromString(Code, "-std=c99", "test.c")); std::string Json = teardownProfiler(); - std::string TraceGraph = buildTraceGraph(Json); - ASSERT_TRUE(TraceGraph == R"( 
+ ASSERT_EQ(R"( Frontend | ParseDeclarationOrFunctionDefinition (test.c:2:1) | | isIntegerConstantExpr () | | EvaluateKnownConstIntCheckOverflow () | PerformPendingInstantiations -)"); - - // NOTE: If this test is failing, run this test with - // `llvm::errs() << TraceGraph;` and change the assert above. +)", + buildTraceGraph(Json)); } diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h index 31f7df10916db..6eb92930b36fd 100644 --- a/llvm/include/llvm/Support/TimeProfiler.h +++ b/llvm/include/llvm/Support/TimeProfiler.h @@ -83,16 +83,28 @@ namespace llvm { class raw_pwrite_stream; +struct TimeTraceMetadata { + std::string Detail; + // Source file and line number information for the event. + std::string File; + int Line; + + bool isEmpty() const { return Detail.empty() && File.empty(); } +}; + struct TimeTraceProfiler; TimeTraceProfiler *getTimeTraceProfilerInstance(); +bool isTimeTraceVerbose(); + struct TimeTraceProfilerEntry; /// Initialize the time trace profiler. /// This sets up the global \p TimeTraceProfilerInstance /// variable to be the profiler instance. void timeTraceProfilerInitialize(unsigned TimeTraceGranularity, - StringRef ProcName); + StringRef ProcName, + bool TimeTraceVerbose = false); /// Cleanup the time trace profiler, if it was initialized. void timeTraceProfilerCleanup(); @@ -128,6 +140,10 @@ TimeTraceProfilerEntry * timeTraceProfilerBegin(StringRef Name, llvm::function_ref Detail); +TimeTraceProfilerEntry * +timeTraceProfilerBegin(StringRef Name, + llvm::function_ref MetaData); + /// Manually begin a time section, with the given \p Name and \p Detail. /// This starts Async Events having \p Name as a category which is shown /// separately from other traces. 
See @@ -164,6 +180,11 @@ class TimeTraceScope { if (getTimeTraceProfilerInstance() != nullptr) Entry = timeTraceProfilerBegin(Name, Detail); } + TimeTraceScope(StringRef Name, + llvm::function_ref Metadata) { + if (getTimeTraceProfilerInstance() != nullptr) + Entry = timeTraceProfilerBegin(Name, Metadata); + } ~TimeTraceScope() { if (getTimeTraceProfilerInstance() != nullptr) timeTraceProfilerEnd(Entry); diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index 9612db7d30f98..c2014028ddadc 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ b/llvm/lib/Support/TimeProfiler.cpp @@ -73,12 +73,20 @@ struct llvm::TimeTraceProfilerEntry { const TimePointType Start; TimePointType End; const std::string Name; - const std::string Detail; + TimeTraceMetadata Metadata; + const bool AsyncEvent = false; TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N, std::string &&Dt, bool Ae) + : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), Metadata(), + AsyncEvent(Ae) { + Metadata.Detail = std::move(Dt); + } + + TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N, + TimeTraceMetadata &&Mt, bool Ae) : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), - Detail(std::move(Dt)), AsyncEvent(Ae) {} + Metadata(std::move(Mt)), AsyncEvent(Ae) {} // Calculate timings for FlameGraph. Cast time points to microsecond precision // rather than casting duration. 
This avoids truncation issues causing inner @@ -97,10 +105,12 @@ struct llvm::TimeTraceProfilerEntry { }; struct llvm::TimeTraceProfiler { - TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "") + TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "", + bool TimeTraceVerbose = false) : BeginningOfTime(system_clock::now()), StartTime(ClockType::now()), ProcName(ProcName), Pid(sys::Process::getProcessId()), - Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) { + Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity), + TimeTraceVerbose(TimeTraceVerbose) { llvm::get_thread_name(ThreadName); } @@ -113,6 +123,15 @@ struct llvm::TimeTraceProfiler { return Stack.back().get(); } + TimeTraceProfilerEntry * + begin(std::string Name, llvm::function_ref Metadata, + bool AsyncEvent = false) { + Stack.emplace_back(std::make_unique( + ClockType::now(), TimePointType(), std::move(Name), Metadata(), + AsyncEvent)); + return Stack.back().get(); + } + void end() { assert(!Stack.empty() && "Must call begin() first"); end(*Stack.back()); @@ -184,8 +203,15 @@ struct llvm::TimeTraceProfiler { J.attribute("dur", DurUs); } J.attribute("name", E.Name); - if (!E.Detail.empty()) { - J.attributeObject("args", [&] { J.attribute("detail", E.Detail); }); + if (!E.Metadata.isEmpty()) { + J.attributeObject("args", [&] { + if (!E.Metadata.Detail.empty()) + J.attribute("detail", E.Metadata.Detail); + if (!E.Metadata.File.empty()) + J.attribute("file", E.Metadata.File); + if (E.Metadata.Line > 0) + J.attribute("line", E.Metadata.Line); + }); } }); @@ -307,14 +333,25 @@ struct llvm::TimeTraceProfiler { // Minimum time granularity (in microseconds) const unsigned TimeTraceGranularity; + + // Make time trace capture verbose event details (e.g. source filenames). This + // can increase the size of the output by 2-3 times. 
+ const bool TimeTraceVerbose; }; +bool llvm::isTimeTraceVerbose() { + return getTimeTraceProfilerInstance() && + getTimeTraceProfilerInstance()->TimeTraceVerbose; +} + void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity, - StringRef ProcName) { + StringRef ProcName, + bool TimeTraceVerbose) { assert(TimeTraceProfilerInstance == nullptr && "Profiler should not be initialized"); TimeTraceProfilerInstance = new TimeTraceProfiler( - TimeTraceGranularity, llvm::sys::path::filename(ProcName)); + TimeTraceGranularity, llvm::sys::path::filename(ProcName), + TimeTraceVerbose); } // Removes all TimeTraceProfilerInstances. @@ -381,6 +418,14 @@ llvm::timeTraceProfilerBegin(StringRef Name, return nullptr; } +TimeTraceProfilerEntry * +llvm::timeTraceProfilerBegin(StringRef Name, + llvm::function_ref Metadata) { + if (TimeTraceProfilerInstance != nullptr) + return TimeTraceProfilerInstance->begin(std::string(Name), Metadata, false); + return nullptr; +} + TimeTraceProfilerEntry *llvm::timeTraceAsyncProfilerBegin(StringRef Name, StringRef Detail) { if (TimeTraceProfilerInstance != nullptr) From 1cc107234969c33a7036b9694da57f4223e3e4d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Thu, 18 Jul 2024 16:22:37 +0200 Subject: [PATCH 026/486] [GlobalIsel] Add G_SCMP and G_UCMP instructions (#98894) https://github.com/llvm/llvm-project/pull/83227 --- llvm/docs/GlobalISel/GenericOpcode.rst | 20 ++++++++++++ .../CodeGen/GlobalISel/MachineIRBuilder.h | 28 +++++++++++++++++ llvm/include/llvm/Support/TargetOpcodes.def | 6 ++++ llvm/include/llvm/Target/GenericOpcodes.td | 14 +++++++++ .../CodeGen/GlobalISel/MachineIRBuilder.cpp | 12 +++++++ llvm/lib/CodeGen/MachineVerifier.cpp | 30 ++++++++++++++++++ .../GlobalISel/legalizer-info-validation.mir | 6 ++++ llvm/test/MachineVerifier/test_uscmp.mir | 31 +++++++++++++++++++ .../match-table-cxx.td | 30 +++++++++--------- llvm/test/TableGen/GlobalISelEmitter.td | 2 +- 10 files changed, 163 insertions(+), 16 
deletions(-) create mode 100644 llvm/test/MachineVerifier/test_uscmp.mir diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 18a53a4815722..d32aeff5a69bb 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -348,6 +348,26 @@ G_ICMP Perform integer comparison producing non-zero (true) or zero (false). It's target specific whether a true value is 1, ~0U, or some other non-zero value. +G_SCMP +^^^^^^ + +Perform signed 3-way integer comparison producing -1 (smaller), 0 (equal), or 1 (larger). + +.. code-block:: none + + %5:_(s32) = G_SCMP %6, %2 + + +G_UCMP +^^^^^^ + +Perform unsigned 3-way integer comparison producing -1 (smaller), 0 (equal), or 1 (larger). + +.. code-block:: none + + %7:_(s32) = G_UCMP %2, %6 + + G_SELECT ^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index e74136f34b234..56a77b8596a18 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1273,6 +1273,34 @@ class MachineIRBuilder { const SrcOp &Op0, const SrcOp &Op1, std::optional Flags = std::nullopt); + /// Build and insert a \p Res = G_SCMP \p Op0, \p Op1 + /// + /// \pre setBasicBlock or setMI must have been called. + + /// \pre \p Res must be a generic virtual register with scalar or + /// vector type. Typically this starts as s2 or . + /// \pre \p Op0 and Op1 must be generic virtual registers with the + /// same number of elements as \p Res. If \p Res is a scalar, + /// \p Op0 must be a scalar. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildSCmp(const DstOp &Res, const SrcOp &Op0, + const SrcOp &Op1); + + /// Build and insert a \p Res = G_UCMP \p Op0, \p Op1 + /// + /// \pre setBasicBlock or setMI must have been called. 
+ + /// \pre \p Res must be a generic virtual register with scalar or + /// vector type. Typically this starts as s2 or . + /// \pre \p Op0 and Op1 must be generic virtual registers with the + /// same number of elements as \p Res. If \p Res is a scalar, + /// \p Op0 must be a scalar. + /// + /// \return a MachineInstrBuilder for the newly created instruction. + MachineInstrBuilder buildUCmp(const DstOp &Res, const SrcOp &Op0, + const SrcOp &Op1); + /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src, unsigned Mask) { diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index a6672f87af977..9fb6de49fb205 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -503,6 +503,12 @@ HANDLE_TARGET_OPCODE(G_ICMP) /// Generic floating-point comparison, also applicable to vectors. HANDLE_TARGET_OPCODE(G_FCMP) +/// Generic signed 3-way comparison. +HANDLE_TARGET_OPCODE(G_SCMP) + +/// Generic unsigned 3-way comparison. +HANDLE_TARGET_OPCODE(G_UCMP) + /// Generic select. HANDLE_TARGET_OPCODE(G_SELECT) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 7501048dfdd78..36a0a087ba457 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -430,6 +430,20 @@ def G_FCMP : GenericInstruction { let hasSideEffects = false; } +// Generic signed three-way comparison. +def G_SCMP : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src1, type1:$src2); + let hasSideEffects = false; +} + +// Generic unsigned three-way comparison. 
+def G_UCMP : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src1, type1:$src2); + let hasSideEffects = false; +} + // Generic select def G_SELECT : GenericInstruction { let OutOperandList = (outs type0:$dst); diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 06a6c1f93ef1f..7eb6cd4e0d798 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -911,6 +911,18 @@ MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred, return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}, Flags); } +MachineInstrBuilder MachineIRBuilder::buildSCmp(const DstOp &Res, + const SrcOp &Op0, + const SrcOp &Op1) { + return buildInstr(TargetOpcode::G_SCMP, Res, {Op0, Op1}); +} + +MachineInstrBuilder MachineIRBuilder::buildUCmp(const DstOp &Res, + const SrcOp &Op0, + const SrcOp &Op1) { + return buildInstr(TargetOpcode::G_UCMP, Res, {Op0, Op1}); +} + MachineInstrBuilder MachineIRBuilder::buildSelect(const DstOp &Res, const SrcOp &Tst, const SrcOp &Op0, const SrcOp &Op1, diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 0a5b8bdbc9371..d22fbe322ec36 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1544,6 +1544,36 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { break; } + case TargetOpcode::G_SCMP: + case TargetOpcode::G_UCMP: { + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + LLT SrcTy = MRI->getType(MI->getOperand(1).getReg()); + LLT SrcTy2 = MRI->getType(MI->getOperand(2).getReg()); + + if (SrcTy.isPointerOrPointerVector() || SrcTy2.isPointerOrPointerVector()) { + report("Generic scmp/ucmp does not support pointers as operands", MI); + break; + } + + if (DstTy.isPointerOrPointerVector()) { + report("Generic scmp/ucmp does not support pointers as a result", 
MI); + break; + } + + if ((DstTy.isVector() != SrcTy.isVector()) || + (DstTy.isVector() && + DstTy.getElementCount() != SrcTy.getElementCount())) { + report("Generic vector scmp/ucmp must preserve number of lanes", MI); + break; + } + + if (SrcTy != SrcTy2) { + report("Generic scmp/ucmp must have same input types", MI); + break; + } + + break; + } case TargetOpcode::G_EXTRACT: { const MachineOperand &SrcOp = MI->getOperand(1); if (!SrcOp.isReg()) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 61ea3fb998374..b8da462ed78a1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -351,6 +351,12 @@ # DEBUG-NEXT: G_FCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_SELECT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/MachineVerifier/test_uscmp.mir b/llvm/test/MachineVerifier/test_uscmp.mir new file mode 100644 index 0000000000000..aa686c4ec73e6 --- /dev/null +++ b/llvm/test/MachineVerifier/test_uscmp.mir @@ -0,0 +1,31 @@ +# RUN: not --crash llc -verify-machineinstrs -run-pass none -mtriple=arm64 -o /dev/null %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target + +--- +name: test_uscmp +body: | + bb.0: + + %2:_(p0) = G_IMPLICIT_DEF + %3:_(p0) = G_IMPLICIT_DEF + ; CHECK: Generic scmp/ucmp does not support pointers as operands + %4:_(s1) = G_SCMP %2, %3 + + %12:_(s64) = G_IMPLICIT_DEF + %13:_(s64) = G_IMPLICIT_DEF + ; CHECK: Generic scmp/ucmp does not support pointers as a result + %14:_(p0) = G_SCMP %12, %13 + + %23:_(<2 x s32>) = G_IMPLICIT_DEF + %24:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK: Generic vector scmp/ucmp must preserve number of lanes + %5:_(s1) = G_UCMP %23, %24 + + %15:_(s32) = G_CONSTANT i32 0 + %16:_(s64) = G_CONSTANT i64 2 + ; CHECK: Generic scmp/ucmp must have same input types + %17:_(s1) = G_SCMP %15, %16 + + + +... 
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td index 7bbde818082ce..0dcd0c2dd50a8 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td @@ -86,12 +86,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { // CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2([[#LOWER:]]), GIMT_Encode2([[#UPPER:]]), /*)*//*default:*//*Label 4*/ GIMT_Encode4([[#DEFAULT:]]), -// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(410), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(428), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 2*/ GIMT_Encode4(440), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_FABS*//*Label 3*/ GIMT_Encode4(452), +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(418), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(436), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 2*/ GIMT_Encode4(448), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_FABS*//*Label 3*/ GIMT_Encode4(460), // CHECK-NEXT: // Label 0: @[[#%u, mul(UPPER-LOWER, 4) + 10]] -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(427), // Rule ID 2 // +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(435), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: // MIs[0] x // CHECK-NEXT: // No operand predicates @@ -101,10 +101,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_GICombiner1), // CHECK-NEXT: // Combiner Rule #2: TwoMatchNoApply // CHECK-NEXT: GIR_EraseRootFromParent_Done, -// CHECK-NEXT: // Label 5: @427 +// CHECK-NEXT: // Label 5: @435 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @428 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(439), // Rule ID 3 // +// CHECK-NEXT: // Label 1: @436 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(447), // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -112,10 +112,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // Combiner Rule #3: NoMatchTwoApply // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner2), -// CHECK-NEXT: // Label 6: @439 +// CHECK-NEXT: // Label 
6: @447 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @440 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(451), // Rule ID 1 // +// CHECK-NEXT: // Label 2: @448 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(459), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -123,10 +123,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // Combiner Rule #1: TwoMatchTwoApply // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner1), -// CHECK-NEXT: // Label 7: @451 +// CHECK-NEXT: // Label 7: @459 // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @452 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(463), // Rule ID 0 // +// CHECK-NEXT: // Label 3: @460 +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(471), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -134,7 +134,7 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // No operand predicates // CHECK-NEXT: // Combiner Rule #0: OneMatchOneApply // CHECK-NEXT: GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: // Label 8: @463 +// CHECK-NEXT: // Label 8: @471 // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: // Label 4: @[[#%u, DEFAULT]] // CHECK-NEXT: GIM_Reject, diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td index 796f595930319..853831366fa53 100644 --- a/llvm/test/TableGen/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter.td @@ -513,7 +513,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3), // R00O-NEXT: GIM_Reject, // R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]] // R00O-NEXT: GIM_Reject, 
-// R00O-NEXT: }; // Size: 1808 bytes +// R00O-NEXT: }; // Size: 1816 bytes def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4), [(set GPR32:$dst, From 47b63cd508f993d9fab2acfbf0dcf86cdc8c5335 Mon Sep 17 00:00:00 2001 From: Daniel Bertalan Date: Thu, 18 Jul 2024 16:26:32 +0200 Subject: [PATCH 027/486] [lld-macho] Save all thin archive members in repro tarball (#97169) Previously, we only saved those members of thin archives into a repro file that were actually used during linking. However, -ObjC handling requires us to inspect all members, even those that don't end up being loaded. We weren't handling missing members correctly and crashed with an "unhandled `Error`" failure in LLVM_ENABLE_ABI_BREAKING_CHECKS builds. To fix this, we now eagerly load all object files and warn when encountering missing members (in the instances where it wasn't a hard error before). To avoid having to patch out the checks when dealing with older repro files, the `--no-warn-thin-archive-missing-members` flag is added as an escape hatch. 
--- lld/MachO/Config.h | 1 + lld/MachO/Driver.cpp | 43 ++++++++++++++++++-- lld/MachO/InputFiles.cpp | 4 -- lld/MachO/Options.td | 3 ++ lld/test/MachO/reproduce-thin-archive-objc.s | 25 ++++++++++++ lld/test/MachO/reproduce-thin-archives.s | 19 +++++++-- 6 files changed, 83 insertions(+), 12 deletions(-) create mode 100644 lld/test/MachO/reproduce-thin-archive-objc.s diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 5c354e0fe8821..e79812b16ec12 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -212,6 +212,7 @@ struct Configuration { bool csProfileGenerate = false; llvm::StringRef csProfilePath; bool pgoWarnMismatch; + bool warnThinArchiveMissingMembers; bool callGraphProfileSort = false; llvm::StringRef printSymbolOrder; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index ffb3feae25ca4..dc9d635b48ec4 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -270,6 +270,20 @@ struct ArchiveFileInfo { static DenseMap loadedArchives; +static void saveThinArchiveToRepro(ArchiveFile const *file) { + assert(tar && file->getArchive().isThin()); + + Error e = Error::success(); + for (const object::Archive::Child &c : file->getArchive().children(e)) { + MemoryBufferRef mb = CHECK(c.getMemoryBufferRef(), + toString(file) + ": failed to get buffer"); + tar->append(relativeToRoot(CHECK(c.getFullName(), file)), mb.getBuffer()); + } + if (e) + error(toString(file) + + ": Archive::children failed: " + toString(std::move(e))); +} + static InputFile *addFile(StringRef path, LoadType loadType, bool isLazy = false, bool isExplicit = true, bool isBundleLoader = false, @@ -301,6 +315,9 @@ static InputFile *addFile(StringRef path, LoadType loadType, if (!archive->isEmpty() && !archive->hasSymbolTable()) error(path + ": archive has no index; run ranlib to add one"); file = make(std::move(archive), isForceHidden); + + if (tar && file->getArchive().isThin()) + saveThinArchiveToRepro(file); } else { file = entry->second.file; // Command-line loads take 
precedence. If file is previously loaded via @@ -330,9 +347,13 @@ static InputFile *addFile(StringRef path, LoadType loadType, reason = "-all_load"; break; } - if (Error e = file->fetch(c, reason)) - error(toString(file) + ": " + reason + - " failed to load archive member: " + toString(std::move(e))); + if (Error e = file->fetch(c, reason)) { + if (config->warnThinArchiveMissingMembers) + warn(toString(file) + ": " + reason + + " failed to load archive member: " + toString(std::move(e))); + else + llvm::consumeError(std::move(e)); + } } if (e) error(toString(file) + @@ -349,7 +370,18 @@ static InputFile *addFile(StringRef path, LoadType loadType, Error e = Error::success(); for (const object::Archive::Child &c : file->getArchive().children(e)) { Expected mb = c.getMemoryBufferRef(); - if (!mb || !hasObjCSection(*mb)) + if (!mb) { + // We used to create broken repro tarballs that only included those + // object files from thin archives that ended up being used. + if (config->warnThinArchiveMissingMembers) + warn(toString(file) + ": -ObjC failed to open archive member: " + + toString(mb.takeError())); + else + llvm::consumeError(mb.takeError()); + continue; + } + + if (!hasObjCSection(*mb)) continue; if (Error e = file->fetch(c, "-ObjC")) error(toString(file) + ": -ObjC failed to load archive member: " + @@ -1699,6 +1731,9 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, config->csProfilePath = args.getLastArgValue(OPT_cs_profile_path); config->pgoWarnMismatch = args.hasFlag(OPT_pgo_warn_mismatch, OPT_no_pgo_warn_mismatch, true); + config->warnThinArchiveMissingMembers = + args.hasFlag(OPT_warn_thin_archive_missing_members, + OPT_no_warn_thin_archive_missing_members, true); config->generateUuid = !args.hasArg(OPT_no_uuid); for (const Arg *arg : args.filtered(OPT_alias)) { diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp index b40a812f30bd3..3086c9cc4729d 100644 --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -2200,10 
+2200,6 @@ Error ArchiveFile::fetch(const object::Archive::Child &c, StringRef reason) { if (!mb) return mb.takeError(); - // Thin archives refer to .o files, so --reproduce needs the .o files too. - if (tar && c.getParent()->isThin()) - tar->append(relativeToRoot(CHECK(c.getFullName(), this)), mb->getBuffer()); - Expected> modTime = c.getLastModified(); if (!modTime) return modTime.takeError(); diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index dc2212399222f..bbd8bf70c3a0c 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -153,6 +153,9 @@ def cs_profile_path: Joined<["--"], "cs-profile-path=">, defm pgo_warn_mismatch: BB<"pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)", "turn off warnings about profile cfg mismatch">, Group; +defm warn_thin_archive_missing_members : BB<"warn-thin-archive-missing-members", + "Warn on missing object files referenced by thin archives (default)", + "Do not warn on missing object files referenced by thin archives">, Group; // This is a complete Options.td compiled from Apple's ld(1) manpage // dated 2018-03-07 and cross checked with ld64 source code in repo diff --git a/lld/test/MachO/reproduce-thin-archive-objc.s b/lld/test/MachO/reproduce-thin-archive-objc.s new file mode 100644 index 0000000000000..c5fe42f130526 --- /dev/null +++ b/lld/test/MachO/reproduce-thin-archive-objc.s @@ -0,0 +1,25 @@ +# REQUIRES: x86 + +## For a long time, LLD only included those members from thin archives that were actually used +## during linking. However, we need to iterate over all members for -ObjC, check that we don't +## crash when we encounter a missing member. 
+ +# RUN: rm -rf %t; mkdir %t +# RUN: sed s/SYM/_main/ %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o %t/main.o +# RUN: sed s/SYM/_unused/ %s | llvm-mc -filetype=obj -triple=x86_64-apple-macos -o %t/unused.o + +# RUN: cd %t; llvm-ar rcsT unused.a unused.o; rm unused.o +## FIXME: Absolute paths don't end up relativized in the repro file. + +# RUN: %no-fatal-warnings-lld %t/main.o %t/unused.a -ObjC -o /dev/null 2>&1 \ +# RUN: | FileCheck %s --check-prefix=WARN + +# RUN: %lld %t/main.o %t/unused.a -ObjC --no-warn-thin-archive-missing-members -o /dev/null \ +# RUN: | FileCheck %s --implicit-check-not 'warning' --allow-empty + +# WARN: ld64.lld: warning: {{.*}}unused.a: -ObjC failed to open archive member: 'unused.o' + +.text +.globl SYM +SYM: + ret diff --git a/lld/test/MachO/reproduce-thin-archives.s b/lld/test/MachO/reproduce-thin-archives.s index 9dee3f400e06a..33eeaede7aa41 100644 --- a/lld/test/MachO/reproduce-thin-archives.s +++ b/lld/test/MachO/reproduce-thin-archives.s @@ -1,10 +1,11 @@ # REQUIRES: x86 -# RUN: rm -rf %t.dir -# RUN: mkdir -p %t.dir -# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos %s -o %t.dir/foo.o +# RUN: rm -rf %t.dir; split-file %s %t.dir + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos %t.dir/foo.s -o %t.dir/foo.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-macos %t.dir/unused.s -o %t.dir/unused.o # RUN: cd %t.dir -# RUN: llvm-ar rcsT foo.a foo.o +# RUN: llvm-ar rcsT foo.a foo.o unused.o # RUN: %lld foo.a -o /dev/null --reproduce repro.tar # RUN: tar tf repro.tar | FileCheck -DPATH='repro/%:t.dir' %s @@ -12,9 +13,19 @@ # RUN: %lld -all_load foo.a -o /dev/null --reproduce repro2.tar # RUN: tar tf repro2.tar | FileCheck -DPATH='repro2/%:t.dir' %s +# RUN: %lld -ObjC foo.a -o /dev/null --reproduce repro3.tar +# RUN: tar tf repro3.tar | FileCheck -DPATH='repro3/%:t.dir' %s + # CHECK-DAG: [[PATH]]/foo.a # CHECK-DAG: [[PATH]]/foo.o +# CHECK-DAG: [[PATH]]/unused.o +#--- foo.s .globl _main _main: nop + +#--- 
unused.s +.globl _unused +_unused: + nop From c0c4ad5d9a6e05e0b1f5f98ce2e08d479b281be8 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Thu, 18 Jul 2024 07:34:04 -0700 Subject: [PATCH 028/486] [clang][test] Split AArch64 target feature checks across multiple lines. NFC (#99365) Whenever these tests change, it's difficult to see why they don't match, and the diff after you've fixed them isn't easy to grok. By splitting them with a sed pipe, we fix both issues simultaneously. --- .../Preprocessor/aarch64-target-features.c | 351 +++++++++++++++--- 1 file changed, 303 insertions(+), 48 deletions(-) diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index 71cc36acf3f0e..d811cb36e28d8 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -291,54 +291,309 @@ // RUN: %clang -target aarch64 -mtune=CYCLONE -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MTUNE-CYCLONE %s // CHECK-MTUNE-CYCLONE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" -// RUN: %clang -target aarch64 -mcpu=apple-a7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s -// RUN: %clang -target aarch64 -mcpu=apple-a8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s -// RUN: %clang -target aarch64 -mcpu=apple-a9 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s -// RUN: %clang -target aarch64 -mcpu=apple-a10 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A10 %s -// RUN: %clang -target aarch64 -mcpu=apple-a11 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A11 %s -// RUN: %clang -target aarch64 -mcpu=apple-a12 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A12 %s -// RUN: %clang -target aarch64 -mcpu=apple-a13 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A13 %s -// RUN: %clang -target aarch64 -mcpu=apple-s4 -### -c %s 2>&1 | 
FileCheck -check-prefix=CHECK-MCPU-APPLE-A12 %s -// RUN: %clang -target aarch64 -mcpu=apple-s5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A12 %s -// RUN: %clang -target aarch64 -mcpu=cyclone -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s -// RUN: %clang -target aarch64 -mcpu=cortex-a34 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A34 %s -// RUN: %clang -target aarch64 -mcpu=cortex-a35 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A35 %s -// RUN: %clang -target aarch64 -mcpu=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A53 %s -// RUN: %clang -target aarch64 -mcpu=cortex-a57 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A57 %s -// RUN: %clang -target aarch64 -mcpu=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A72 %s -// RUN: %clang -target aarch64 -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-CORTEX-A73 %s -// RUN: %clang -target aarch64 -mcpu=cortex-r82 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-CORTEX-R82 %s -// RUN: %clang -target aarch64 -mcpu=exynos-m3 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-M3 %s -// RUN: %clang -target aarch64 -mcpu=exynos-m4 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-M4 %s -// RUN: %clang -target aarch64 -mcpu=exynos-m5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-M4 %s -// RUN: %clang -target aarch64 -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-KRYO %s -// RUN: %clang -target aarch64 -mcpu=thunderx2t99 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-THUNDERX2T99 %s -// RUN: %clang -target aarch64 -mcpu=a64fx -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A64FX %s -// RUN: %clang -target aarch64 -mcpu=carmel -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-CARMEL %s -// CHECK-MCPU-APPLE-A7: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" 
"+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-APPLE-A10: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lor" "-target-feature" "+neon" "-target-feature" "+pan" "-target-feature" "+perfmon" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+vh" -// CHECK-MCPU-APPLE-A11: "-cc1"{{.*}} "-triple" "aarch64{{.*}}"{{.*}}"-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" -// CHECK-MCPU-APPLE-A12: "-cc1"{{.*}} "-triple" "aarch64"{{.*}} "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" -// CHECK-MCPU-A34: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-APPLE-A13: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" 
"-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sha3" -// CHECK-MCPU-A35: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-A53: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-CORTEX-R82: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8r" "-target-feature" "+ccdp" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+flagm" "-target-feature" "+fp-armv8" "-target-feature" "+fp16fml" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+predres" "-target-feature" 
"+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+ssbs" -// CHECK-MCPU-M3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-M4: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" -// CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2" -// CHECK-MCPU-THUNDERX2T99: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.1a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+rdm" "-target-feature" "+sha2 -// CHECK-MCPU-A64FX: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" "-target-feature" "+sve" -// CHECK-MCPU-CARMEL: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+v8.2a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+ras" "-target-feature" "+rdm" "-target-feature" "+sha2" - -// 
RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s -// CHECK-ARCH-ARM64: "-target-cpu" "apple-m1" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.4a" "-target-feature" "+aes" "-target-feature" "+altnzcv" "-target-feature" "+ccdp" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+dotprod" "-target-feature" "+fp-armv8" "-target-feature" "+fp16fml" "-target-feature" "+fptoint" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+predres" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sb" "-target-feature" "+sha2" "-target-feature" "+sha3" "-target-feature" "+specrestrict" "-target-feature" "+ssbs" - -// RUN: %clang -target x86_64-apple-macosx -arch arm64_32 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64_32 %s -// CHECK-ARCH-ARM64_32: "-target-cpu" "apple-s4" "-target-feature" "+zcm" "-target-feature" "+zcz" "-target-feature" "+v8.3a" "-target-feature" "+aes" "-target-feature" "+complxnum" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+fullfp16" "-target-feature" "+jsconv" "-target-feature" "+lse" "-target-feature" "+neon" "-target-feature" "+pauth" "-target-feature" "+perfmon" "-target-feature" "+ras" "-target-feature" "+rcpc" "-target-feature" "+rdm" "-target-feature" "+sha2" +// RUN: %clang -target aarch64 -mcpu=apple-a7 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s +// RUN: %clang -target aarch64 -mcpu=apple-a8 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s +// RUN: %clang -target aarch64 -mcpu=apple-a9 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s +// RUN: %clang -target aarch64 -mcpu=apple-a10 -### -c %s 2>&1 
| sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A10 %s +// RUN: %clang -target aarch64 -mcpu=apple-a11 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A11 %s +// RUN: %clang -target aarch64 -mcpu=apple-a12 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A12 %s +// RUN: %clang -target aarch64 -mcpu=apple-a13 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A13 %s +// RUN: %clang -target aarch64 -mcpu=apple-s4 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A12 %s +// RUN: %clang -target aarch64 -mcpu=apple-s5 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A12 %s +// RUN: %clang -target aarch64 -mcpu=cyclone -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-APPLE-A7 %s +// RUN: %clang -target aarch64 -mcpu=cortex-a34 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-A34 %s +// RUN: %clang -target aarch64 -mcpu=cortex-a35 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-A35 %s +// RUN: %clang -target aarch64 -mcpu=cortex-a53 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-A53 %s +// RUN: %clang -target aarch64 -mcpu=cortex-a57 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-A57 %s +// RUN: %clang -target aarch64 -mcpu=cortex-a72 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-A72 %s +// RUN: %clang -target aarch64 -mcpu=cortex-a73 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-CORTEX-A73 %s +// RUN: %clang -target aarch64 -mcpu=cortex-r82 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-CORTEX-R82 %s +// RUN: %clang -target aarch64 -mcpu=exynos-m3 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-M3 %s +// RUN: %clang -target aarch64 
-mcpu=exynos-m4 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-M4 %s +// RUN: %clang -target aarch64 -mcpu=exynos-m5 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-M4 %s +// RUN: %clang -target aarch64 -mcpu=kryo -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-KRYO %s +// RUN: %clang -target aarch64 -mcpu=thunderx2t99 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-THUNDERX2T99 %s +// RUN: %clang -target aarch64 -mcpu=a64fx -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-A64FX %s +// RUN: %clang -target aarch64 -mcpu=carmel -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck -check-prefix=CHECK-MCPU-CARMEL %s +// CHECK-MCPU-APPLE-A7-LABEL: "-target-cpu" "apple-a7" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+zcm" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+zcz" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A7-NEXT: "-target-abi" +// CHECK-MCPU-APPLE-A10-LABEL: "-target-cpu" "apple-a10" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+zcm" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+zcz" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+lor" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+pan" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+rdm" +// 
CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-feature" "+vh" +// CHECK-MCPU-APPLE-A10-NEXT: "-target-abi" +// CHECK-MCPU-APPLE-A11-LABEL: "-target-cpu" "apple-a11" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+zcm" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+zcz" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+v8.2a" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+fullfp16" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+ras" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+rdm" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A11-NEXT: "-target-abi" +// CHECK-MCPU-APPLE-A12-LABEL: "-target-cpu" "apple-a12" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+zcm" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+zcz" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+v8.3a" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+complxnum" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+fullfp16" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+jsconv" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+pauth" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+ras" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+rcpc" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+rdm" +// 
CHECK-MCPU-APPLE-A12-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A12-NEXT: "-target-abi" +// CHECK-MCPU-A34-LABEL: "-target-cpu" "cortex-a34" +// CHECK-MCPU-A34-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-A34-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-A34-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-A34-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-A34-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-A34-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-A34-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-A34-NEXT: "-target-abi" +// CHECK-MCPU-APPLE-A13-LABEL: "-target-cpu" "apple-a13" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+zcm" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+zcz" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+v8.4a" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+complxnum" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+dotprod" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+fp16fml" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+fullfp16" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+jsconv" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+pauth" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+ras" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+rcpc" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+rdm" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-feature" "+sha3" +// CHECK-MCPU-APPLE-A13-NEXT: "-target-abi" +// CHECK-MCPU-A35-LABEL: "-target-cpu" "cortex-a35" +// CHECK-MCPU-A35-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-A35-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-A35-NEXT: 
"-target-feature" "+crc" +// CHECK-MCPU-A35-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-A35-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-A35-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-A35-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-A35-NEXT: "-target-abi" +// CHECK-MCPU-A53-LABEL: "-target-cpu" "cortex-a53" +// CHECK-MCPU-A53-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-A53-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-A53-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-A53-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-A53-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-A53-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-A53-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-A53-NEXT: "-target-abi" +// CHECK-MCPU-A57-LABEL: "-target-cpu" "cortex-a57" +// CHECK-MCPU-A57-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-A57-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-A57-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-A57-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-A57-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-A57-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-A57-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-A57-NEXT: "-target-abi" +// CHECK-MCPU-A72-LABEL: "-target-cpu" "cortex-a72" +// CHECK-MCPU-A72-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-A72-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-A72-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-A72-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-A72-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-A72-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-A72-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-A72-NEXT: "-target-abi" +// CHECK-MCPU-CORTEX-A73-LABEL: "-target-cpu" "cortex-a73" +// CHECK-MCPU-CORTEX-A73-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-CORTEX-A73-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-CORTEX-A73-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-CORTEX-A73-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-CORTEX-A73-NEXT: "-target-feature" "+neon" +// 
CHECK-MCPU-CORTEX-A73-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-CORTEX-A73-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-CORTEX-A73-NEXT: "-target-abi" +// CHECK-MCPU-CORTEX-R82-LABEL: "-target-cpu" "cortex-r82" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+v8r" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+ccdp" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+complxnum" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+dotprod" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+flagm" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+fp16fml" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+fullfp16" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+jsconv" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+pauth" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+predres" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+ras" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+rcpc" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+rdm" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+sb" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-feature" "+ssbs" +// CHECK-MCPU-CORTEX-R82-NEXT: "-target-abi" +// CHECK-MCPU-M3-LABEL: "-target-cpu" "exynos-m3" +// CHECK-MCPU-M3-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-M3-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-M3-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-M3-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-M3-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-M3-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-M3-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-M3-NEXT: "-target-abi" +// CHECK-MCPU-M4-LABEL: "-target-cpu" "exynos-m{{[45]}}" +// CHECK-MCPU-M4-NEXT: "-target-feature" 
"+v8.2a" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+dotprod" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+fullfp16" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+ras" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+rdm" +// CHECK-MCPU-M4-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-M4-NEXT: "-target-abi" +// CHECK-MCPU-KRYO-LABEL: "-target-cpu" "kryo" +// CHECK-MCPU-KRYO-NEXT: "-target-feature" "+v8a" +// CHECK-MCPU-KRYO-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-KRYO-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-KRYO-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-KRYO-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-KRYO-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-KRYO-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-KRYO-NEXT: "-target-abi" +// CHECK-MCPU-THUNDERX2T99-LABEL: "-target-cpu" "thunderx2t99" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+v8.1a" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+rdm" +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-feature" "+sha2 +// CHECK-MCPU-THUNDERX2T99-NEXT: "-target-abi" +// CHECK-MCPU-A64FX-LABEL: "-target-cpu" "a64fx" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+v8.2a" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+complxnum" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+fp-armv8" +// 
CHECK-MCPU-A64FX-NEXT: "-target-feature" "+fullfp16" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+perfmon" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+ras" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+rdm" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-A64FX-NEXT: "-target-feature" "+sve" +// CHECK-MCPU-A64FX-NEXT: "-target-abi" +// CHECK-MCPU-CARMEL-LABEL: "-target-cpu" "carmel" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+v8.2a" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+aes" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+crc" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+fp-armv8" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+fullfp16" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+lse" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+neon" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+ras" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+rdm" +// CHECK-MCPU-CARMEL-NEXT: "-target-feature" "+sha2" +// CHECK-MCPU-CARMEL-NEXT: "-target-abi" + + +// RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s +// CHECK-ARCH-ARM64-LABEL: "-target-cpu" "apple-m1" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+zcm" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+zcz" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+v8.4a" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+aes" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+altnzcv" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+ccdp" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+complxnum" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+crc" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+dotprod" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+fp-armv8" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+fp16fml" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+fptoint" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" 
"+fullfp16" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+jsconv" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+lse" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+neon" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+pauth" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+perfmon" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+predres" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+ras" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+rcpc" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+rdm" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+sb" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+sha2" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+sha3" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+specrestrict" +// CHECK-ARCH-ARM64-NEXT: "-target-feature" "+ssbs" +// CHECK-ARCH-ARM64-NEXT: "-target-abi" + +// RUN: %clang -target x86_64-apple-macosx -arch arm64_32 -### -c %s 2>&1 | sed -e 's/"-/\n"-/g' | FileCheck --check-prefix=CHECK-ARCH-ARM64_32 %s +// CHECK-ARCH-ARM64_32-LABEL: "-target-cpu" "apple-s4" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+zcm" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+zcz" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+v8.3a" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+aes" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+complxnum" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+crc" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+fp-armv8" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+fullfp16" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+jsconv" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+lse" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+neon" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+pauth" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+perfmon" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+ras" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+rcpc" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+rdm" +// CHECK-ARCH-ARM64_32-NEXT: "-target-feature" "+sha2" +// 
CHECK-ARCH-ARM64_32-NEXT: "-target-abi" // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s From 257a0d535ac052a4eb1bb847605eff1eb169087d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 18 Jul 2024 14:40:55 +0200 Subject: [PATCH 029/486] [clang][Interp] Diagnose out-of-range casts to enum types --- clang/lib/AST/Interp/Compiler.cpp | 10 ++++ clang/lib/AST/Interp/Interp.cpp | 26 +++++++++ clang/lib/AST/Interp/Interp.h | 14 +++++ clang/lib/AST/Interp/Opcodes.td | 7 +++ clang/test/AST/Interp/cxx11.cpp | 90 +++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp index 28c4ffd071862..24140b23c1f0b 100644 --- a/clang/lib/AST/Interp/Compiler.cpp +++ b/clang/lib/AST/Interp/Compiler.cpp @@ -466,6 +466,16 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { if (!this->visit(SubExpr)) return false; + // Possibly diagnose casts to enum types if the target type does not + // have a fixed size. 
+ if (CE->getType()->isEnumeralType()) { + if (const auto *ET = CE->getType().getCanonicalType()->getAs(); + ET && !ET->getDecl()->isFixed()) { + if (!this->emitCheckEnumValue(*FromT, ET->getDecl(), CE)) + return false; + } + } + if (ToT == PT_IntAP) return this->emitCastAP(*FromT, Ctx.getBitWidth(CE->getType()), CE); if (ToT == PT_IntAPS) diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index e6e9298982887..cd6fc60400ebd 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -22,6 +22,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "llvm/ADT/APSInt.h" +#include "llvm/ADT/StringExtras.h" #include #include @@ -899,6 +900,31 @@ bool RunDestructors(InterpState &S, CodePtr OpPC, const Block *B) { return runRecordDestructor(S, OpPC, Pointer(const_cast(B)), Desc); } +void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED, + const APSInt &Value) { + llvm::APInt Min; + llvm::APInt Max; + + if (S.EvaluatingDecl && !S.EvaluatingDecl->isConstexpr()) + return; + + ED->getValueRange(Max, Min); + --Max; + + if (ED->getNumNegativeBits() && + (Max.slt(Value.getSExtValue()) || Min.sgt(Value.getSExtValue()))) { + const SourceLocation &Loc = S.Current->getLocation(OpPC); + S.report(Loc, diag::warn_constexpr_unscoped_enum_out_of_range) + << llvm::toString(Value, 10) << Min.getSExtValue() << Max.getSExtValue() + << ED; + } else if (!ED->getNumNegativeBits() && Max.ult(Value.getZExtValue())) { + const SourceLocation &Loc = S.Current->getLocation(OpPC); + S.report(Loc, diag::warn_constexpr_unscoped_enum_out_of_range) + << llvm::toString(Value, 10) << Min.getZExtValue() << Max.getZExtValue() + << ED; + } +} + bool Interpret(InterpState &S, APValue &Result) { // The current stack frame when we started Interpret(). 
// This is being used by the ops to determine wheter diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 2e159012f5ffd..f86b787fb034e 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2774,6 +2774,20 @@ inline bool CheckNonNullArg(InterpState &S, CodePtr OpPC) { return false; } +void diagnoseEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED, + const APSInt &Value); + +template ::T> +inline bool CheckEnumValue(InterpState &S, CodePtr OpPC, const EnumDecl *ED) { + assert(ED); + assert(!ED->isFixed()); + const APSInt Val = S.Stk.peek().toAPSInt(); + + if (S.inConstantContext()) + diagnoseEnumValue(S, OpPC, ED, Val); + return true; +} + /// OldPtr -> Integer -> NewPtr. template inline bool DecayPtr(InterpState &S, CodePtr OpPC) { diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index 3e69098570bd7..49ebb156ab2fb 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -66,6 +66,7 @@ def ArgDecl : ArgType { let Name = "const Decl*"; } def ArgVarDecl : ArgType { let Name = "const VarDecl*"; } def ArgDesc : ArgType { let Name = "const Descriptor *"; } def ArgPrimType : ArgType { let Name = "PrimType"; } +def ArgEnumDecl : ArgType { let Name = "const EnumDecl *"; } //===----------------------------------------------------------------------===// // Classes of types instructions operate on. 
@@ -389,6 +390,12 @@ def CheckDecl : Opcode { let Args = [ArgVarDecl]; } +def CheckEnumValue : Opcode { + let Args = [ArgEnumDecl]; + let Types = [FixedSizeIntegralTypeClass]; + let HasGroup = 1; +} + // [] -> [Value] def GetGlobal : AccessOpcode; def GetGlobalUnchecked : AccessOpcode; diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp index 82b2727bbadbb..c0b88f0e567e0 100644 --- a/clang/test/AST/Interp/cxx11.cpp +++ b/clang/test/AST/Interp/cxx11.cpp @@ -62,3 +62,93 @@ namespace ReferenceToConst { } }; } + + + +namespace GH50055 { +// Enums without fixed underlying type +enum E1 {e11=-4, e12=4}; +enum E2 {e21=0, e22=4}; +enum E3 {e31=-4, e32=1024}; +enum E4 {e41=0}; +// Empty but as-if it had a single enumerator with value 0 +enum EEmpty {}; + +// Enum with fixed underlying type because the underlying type is explicitly specified +enum EFixed : int {efixed1=-4, efixed2=4}; +// Enum with fixed underlying type because it is scoped +enum class EScoped {escoped1=-4, escoped2=4}; + +enum EMaxInt {emaxint1=-1, emaxint2=__INT_MAX__}; + +enum NumberType {}; + +E2 testDefaultArgForParam(E2 e2Param = (E2)-1) { // ok, not a constant expression context + E2 e2LocalInit = e2Param; // ok, not a constant expression context + return e2LocalInit; +} + +// #include + +void testValueInRangeOfEnumerationValues() { + constexpr E1 x1 = static_cast(-8); + constexpr E1 x2 = static_cast(8); + // both-error@-1 {{integer value 8 is outside the valid range of values [-8, 7] for the enumeration type 'E1'}} + E1 x2b = static_cast(8); // ok, not a constant expression context + + constexpr E2 x3 = static_cast(-8); + // both-error@-1 {{integer value -8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} + constexpr E2 x4 = static_cast(0); + constexpr E2 x5 = static_cast(8); + // both-error@-1 {{integer value 8 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} + + constexpr E3 x6 = static_cast(-2048); + constexpr E3 x7 = 
static_cast(-8); + constexpr E3 x8 = static_cast(0); + constexpr E3 x9 = static_cast(8); + constexpr E3 x10 = static_cast(2048); + // both-error@-1 {{integer value 2048 is outside the valid range of values [-2048, 2047] for the enumeration type 'E3'}} + + constexpr E4 x11 = static_cast(0); + constexpr E4 x12 = static_cast(1); + constexpr E4 x13 = static_cast(2); + // both-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'E4'}} + + constexpr EEmpty x14 = static_cast(0); + constexpr EEmpty x15 = static_cast(1); + constexpr EEmpty x16 = static_cast(2); + // both-error@-1 {{integer value 2 is outside the valid range of values [0, 1] for the enumeration type 'EEmpty'}} + + constexpr EFixed x17 = static_cast(100); + constexpr EScoped x18 = static_cast(100); + + constexpr EMaxInt x19 = static_cast(__INT_MAX__-1); + constexpr EMaxInt x20 = static_cast((long)__INT_MAX__+1); + // both-error@-1 {{integer value 2147483648 is outside the valid range of values [-2147483648, 2147483647] for the enumeration type 'EMaxInt'}} + + const NumberType neg_one = (NumberType) ((NumberType) 0 - (NumberType) 1); // ok, not a constant expression context +} + +template struct Bitfield { + static constexpr T max = static_cast((1 << size) - 1); // #enum +}; + +void testValueInRangeOfEnumerationValuesViaTemplate() { + Bitfield good; + Bitfield bad; // both-error@#enum {{integer value 15 is outside the valid range of values [0, 7] for the enumeration type 'E2'}} +} + +enum SortOrder { + AscendingOrder, + DescendingOrder +}; + +class A { + static void f(SortOrder order); +}; + +void A::f(SortOrder order) { + if (order == SortOrder(-1)) // ok, not a constant expression context + return; +} +} From 7d74ca9513a3fa53b482230c20b1977a1f3d121b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 18 Jul 2024 16:48:21 +0200 Subject: [PATCH 030/486] [ValueLattice] Support constant vectors in mergeIn() (#99466) This is a followup to vector support in 
LVI/CVP/SCCP. In mergeIn(), if one of the operands is a vector of integer constant, we should try to convert it into a constant range, in case that allows performing a range union to something better than overdefined. --- llvm/include/llvm/Analysis/ValueLattice.h | 34 ++++++++++++------- .../CorrelatedValuePropagation/vectors.ll | 4 +-- llvm/test/Transforms/SCCP/phis.ll | 4 +-- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueLattice.h b/llvm/include/llvm/Analysis/ValueLattice.h index b81eb5f60ab7e..fa56d838a3859 100644 --- a/llvm/include/llvm/Analysis/ValueLattice.h +++ b/llvm/include/llvm/Analysis/ValueLattice.h @@ -281,18 +281,21 @@ class ValueLatticeElement { return std::nullopt; } - ConstantRange asConstantRange(Type *Ty, bool UndefAllowed = false) const { - assert(Ty->isIntOrIntVectorTy() && "Must be integer type"); + ConstantRange asConstantRange(unsigned BW, bool UndefAllowed = false) const { if (isConstantRange(UndefAllowed)) return getConstantRange(); if (isConstant()) return getConstant()->toConstantRange(); - unsigned BW = Ty->getScalarSizeInBits(); if (isUnknown()) return ConstantRange::getEmpty(BW); return ConstantRange::getFull(BW); } + ConstantRange asConstantRange(Type *Ty, bool UndefAllowed = false) const { + assert(Ty->isIntOrIntVectorTy() && "Must be integer type"); + return asConstantRange(Ty->getScalarSizeInBits(), UndefAllowed); + } + bool markOverdefined() { if (isOverdefined()) return false; @@ -384,7 +387,9 @@ class ValueLatticeElement { return true; } - assert(isUnknown() || isUndef()); + assert(isUnknown() || isUndef() || isConstant()); + assert((!isConstant() || NewR.contains(getConstant()->toConstantRange())) && + "Constant must be subset of new range"); NumRangeExtensions = 0; Tag = NewTag; @@ -426,6 +431,16 @@ class ValueLatticeElement { return false; if (RHS.isUndef()) return false; + // If the constant is a vector of integers, try to treat it as a range. 
+ if (getConstant()->getType()->isVectorTy() && + getConstant()->getType()->getScalarType()->isIntegerTy()) { + ConstantRange L = getConstant()->toConstantRange(); + ConstantRange NewR = L.unionWith( + RHS.asConstantRange(L.getBitWidth(), /*UndefAllowed=*/true)); + return markConstantRange( + std::move(NewR), + Opts.setMayIncludeUndef(RHS.isConstantRangeIncludingUndef())); + } markOverdefined(); return true; } @@ -444,14 +459,9 @@ class ValueLatticeElement { return OldTag != Tag; } - if (!RHS.isConstantRange()) { - // We can get here if we've encountered a constantexpr of integer type - // and merge it with a constantrange. - markOverdefined(); - return true; - } - - ConstantRange NewR = getConstantRange().unionWith(RHS.getConstantRange()); + const ConstantRange &L = getConstantRange(); + ConstantRange NewR = L.unionWith( + RHS.asConstantRange(L.getBitWidth(), /*UndefAllowed=*/true)); return markConstantRange( std::move(NewR), Opts.setMayIncludeUndef(RHS.isConstantRangeIncludingUndef())); diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll index 43e680cd25cdb..6254b54d42554 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll @@ -286,7 +286,7 @@ define <2 x i16> @phi_merge1(i1 %c, <2 x i8> %a) { ; CHECK-NEXT: br label %[[JOIN]] ; CHECK: [[JOIN]]: ; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ [[ZEXT]], %[[ENTRY]] ], [ , %[[IF]] ] -; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], ; CHECK-NEXT: ret <2 x i16> [[ADD]] ; entry: @@ -312,7 +312,7 @@ define <2 x i16> @phi_merge2(i1 %c, <2 x i8> %a) { ; CHECK-NEXT: br label %[[JOIN]] ; CHECK: [[JOIN]]: ; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ , %[[ENTRY]] ], [ [[ZEXT]], %[[IF]] ] -; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], ; 
CHECK-NEXT: ret <2 x i16> [[ADD]] ; entry: diff --git a/llvm/test/Transforms/SCCP/phis.ll b/llvm/test/Transforms/SCCP/phis.ll index 83daae0a7c0c8..9264a6eaefb85 100644 --- a/llvm/test/Transforms/SCCP/phis.ll +++ b/llvm/test/Transforms/SCCP/phis.ll @@ -109,7 +109,7 @@ define <2 x i16> @phi_vector_merge1(i1 %c, <2 x i8> %a) { ; CHECK-NEXT: br label %[[JOIN]] ; CHECK: [[JOIN]]: ; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ [[ZEXT]], %[[ENTRY]] ], [ , %[[IF]] ] -; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], ; CHECK-NEXT: ret <2 x i16> [[ADD]] ; entry: @@ -135,7 +135,7 @@ define <2 x i16> @phi_vector_merge2(i1 %c, <2 x i8> %a) { ; CHECK-NEXT: br label %[[JOIN]] ; CHECK: [[JOIN]]: ; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i16> [ , %[[ENTRY]] ], [ [[ZEXT]], %[[IF]] ] -; CHECK-NEXT: [[ADD:%.*]] = add <2 x i16> [[PHI]], +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i16> [[PHI]], ; CHECK-NEXT: ret <2 x i16> [[ADD]] ; entry: From 9711f6bda1363eb3b8850ee67958ab90357db006 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 18 Jul 2024 16:47:27 +0200 Subject: [PATCH 031/486] [GVN] Add additional tests for pointer replacement (NFC) --- llvm/test/Transforms/GVN/condprop.ll | 195 +++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/llvm/test/Transforms/GVN/condprop.ll b/llvm/test/Transforms/GVN/condprop.ll index 6402a23157729..02f4bb97d7ebd 100644 --- a/llvm/test/Transforms/GVN/condprop.ll +++ b/llvm/test/Transforms/GVN/condprop.ll @@ -804,5 +804,200 @@ bb5: br label %bb5 } +define void @select_same_obj(i1 %c, ptr %p, i64 %x) { +; CHECK-LABEL: @select_same_obj( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[X:%.*]] +; CHECK-NEXT: [[P3:%.*]] = select i1 [[C:%.*]], ptr [[P]], ptr [[P2]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @use_ptr(ptr 
[[P]]) +; CHECK-NEXT: call void @use_ptr(ptr [[P3]]) +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %p2 = getelementptr i8, ptr %p, i64 %x + %p3 = select i1 %c, ptr %p, ptr %p2 + %cmp = icmp eq ptr %p, %p3 + br i1 %cmp, label %if, label %exit + +if: + call void @use_ptr(ptr %p) + call void @use_ptr(ptr %p3) + ret void + +exit: + ret void +} + +define void @select_different_obj(i1 %c, ptr %p, ptr %p2) { +; CHECK-LABEL: @select_different_obj( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = select i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @use_ptr(ptr [[P]]) +; CHECK-NEXT: call void @use_ptr(ptr [[P3]]) +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %p3 = select i1 %c, ptr %p, ptr %p2 + %cmp = icmp eq ptr %p, %p3 + br i1 %cmp, label %if, label %exit + +if: + call void @use_ptr(ptr %p) + call void @use_ptr(ptr %p3) + ret void + +exit: + ret void +} + +define void @select_same_obj_is_select(i1 %c, ptr %p, ptr %p2, i64 %x) { +; CHECK-LABEL: @select_same_obj_is_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P3:%.*]] = select i1 [[C:%.*]], ptr [[P:%.*]], ptr [[P2:%.*]] +; CHECK-NEXT: [[P4:%.*]] = getelementptr i8, ptr [[P3]], i64 [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P3]], [[P4]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @use_ptr(ptr [[P3]]) +; CHECK-NEXT: call void @use_ptr(ptr [[P3]]) +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %p3 = select i1 %c, ptr %p, ptr %p2 + %p4 = getelementptr i8, ptr %p3, i64 %x + %cmp = icmp eq ptr %p3, %p4 + br i1 %cmp, label %if, label %exit + +if: + call void @use_ptr(ptr %p3) + call void @use_ptr(ptr %p4) + ret void + +exit: + ret void +} + +define void @phi_same_obj(i1 %c, ptr %p, i64 %x) { +; CHECK-LABEL: 
@phi_same_obj( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[X:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[P3:%.*]] = phi ptr [ [[P]], [[IF]] ], [ [[P2]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF2:%.*]], label [[EXIT:%.*]] +; CHECK: if2: +; CHECK-NEXT: call void @use_ptr(ptr [[P]]) +; CHECK-NEXT: call void @use_ptr(ptr [[P3]]) +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %p2 = getelementptr i8, ptr %p, i64 %x + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %p3 = phi ptr [ %p, %if ], [ %p2, %entry ] + %cmp = icmp eq ptr %p, %p3 + br i1 %cmp, label %if2, label %exit + +if2: + call void @use_ptr(ptr %p) + call void @use_ptr(ptr %p3) + ret void + +exit: + ret void +} + +define void @phi_different_obj(i1 %c, ptr %p, ptr %p2) { +; CHECK-LABEL: @phi_different_obj( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[JOIN:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[P3:%.*]] = phi ptr [ [[P:%.*]], [[IF]] ], [ [[P2:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], [[P3]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF2:%.*]], label [[EXIT:%.*]] +; CHECK: if2: +; CHECK-NEXT: call void @use_ptr(ptr [[P]]) +; CHECK-NEXT: call void @use_ptr(ptr [[P3]]) +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %if, label %join + +if: + br label %join + +join: + %p3 = phi ptr [ %p, %if ], [ %p2, %entry ] + %cmp = icmp eq ptr %p, %p3 + br i1 %cmp, label %if2, label %exit + +if2: + call void @use_ptr(ptr %p) + call void @use_ptr(ptr %p3) + ret void + +exit: + ret void +} + +define void @phi_same_obj_cycle(i1 %c, ptr %p, i64 %x) { +; CHECK-LABEL: @phi_same_obj_cycle( +; CHECK-NEXT: entry: +; 
CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[P_IV:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY:%.*]] ], [ [[P_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[P_NEXT]] = getelementptr i8, ptr [[P_IV]], i64 [[X:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P_IV]], [[P]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[LOOP_LATCH]] +; CHECK: if: +; CHECK-NEXT: call void @use_ptr(ptr [[P_IV]]) +; CHECK-NEXT: call void @use_ptr(ptr [[P]]) +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: br label [[LOOP]] +; +entry: + br label %loop + +loop: + %p.iv = phi ptr [ %p, %entry ], [ %p.next, %loop.latch ] + %p.next = getelementptr i8, ptr %p.iv, i64 %x + %cmp = icmp eq ptr %p.iv, %p + br i1 %cmp, label %if, label %loop.latch + +if: + call void @use_ptr(ptr %p.iv) + call void @use_ptr(ptr %p) + br label %loop.latch + +loop.latch: + br label %loop +} + declare void @use_bool(i1) declare void @use_ptr(ptr) From f6b06b42a3f4f59ff33da20d42358f2768eaf726 Mon Sep 17 00:00:00 2001 From: Akira Hatanaka Date: Thu, 18 Jul 2024 07:51:17 -0700 Subject: [PATCH 032/486] [PAC] Implement function pointer re-signing (#98847) Re-signing occurs when function type discrimination is enabled and a function pointer is converted to another function pointer type that requires signing using a different discriminator. A function pointer is re-signed using discriminator zero when it's converted to a pointer to a non-function type such as `void*`. 
--------- Co-authored-by: Ahmed Bougacha Co-authored-by: John McCall --- clang/lib/CodeGen/Address.h | 42 +++- clang/lib/CodeGen/CGBuilder.h | 4 +- clang/lib/CodeGen/CGExpr.cpp | 3 +- clang/lib/CodeGen/CGExprScalar.cpp | 7 +- clang/lib/CodeGen/CGPointerAuth.cpp | 230 ++++++++++++++++++ clang/lib/CodeGen/CGValue.h | 27 +- clang/lib/CodeGen/CodeGenFunction.cpp | 30 ++- clang/lib/CodeGen/CodeGenFunction.h | 35 ++- clang/lib/Headers/ptrauth.h | 15 ++ .../ptrauth-function-lvalue-cast-disc.c | 68 ++++++ ...ptrauth-function-type-discriminator-cast.c | 94 +++++++ 11 files changed, 513 insertions(+), 42 deletions(-) create mode 100644 clang/test/CodeGen/ptrauth-function-lvalue-cast-disc.c create mode 100644 clang/test/CodeGen/ptrauth-function-type-discriminator-cast.c diff --git a/clang/lib/CodeGen/Address.h b/clang/lib/CodeGen/Address.h index 35ec370a139c9..1c4d2e103b5e7 100644 --- a/clang/lib/CodeGen/Address.h +++ b/clang/lib/CodeGen/Address.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_LIB_CODEGEN_ADDRESS_H #define LLVM_CLANG_LIB_CODEGEN_ADDRESS_H +#include "CGPointerAuthInfo.h" #include "clang/AST/CharUnits.h" #include "clang/AST/Type.h" #include "llvm/ADT/PointerIntPair.h" @@ -108,6 +109,22 @@ class RawAddress { /// Like RawAddress, an abstract representation of an aligned address, but the /// pointer contained in this class is possibly signed. +/// +/// This is designed to be an IR-level abstraction, carrying just the +/// information necessary to perform IR operations on an address like loads and +/// stores. In particular, it doesn't carry C type information or allow the +/// representation of things like bit-fields; clients working at that level +/// should generally be using `LValue`. +/// +/// An address may be either *raw*, meaning that it's an ordinary machine +/// pointer, or *signed*, meaning that the pointer carries an embedded +/// pointer-authentication signature. 
Representing signed pointers directly in +/// this abstraction allows the authentication to be delayed as long as possible +/// without forcing IRGen to use totally different code paths for signed and +/// unsigned values or to separately propagate signature information through +/// every API that manipulates addresses. Pointer arithmetic on signed addresses +/// (e.g. drilling down to a struct field) is accumulated into a separate offset +/// which is applied when the address is finally accessed. class Address { friend class CGBuilderTy; @@ -121,7 +138,11 @@ class Address { CharUnits Alignment; - /// Offset from the base pointer. + /// The ptrauth information needed to authenticate the base pointer. + CGPointerAuthInfo PtrAuthInfo; + + /// Offset from the base pointer. This is non-null only when the base + /// pointer is signed. llvm::Value *Offset = nullptr; llvm::Value *emitRawPointerSlow(CodeGenFunction &CGF) const; @@ -140,12 +161,14 @@ class Address { } Address(llvm::Value *BasePtr, llvm::Type *ElementType, CharUnits Alignment, - llvm::Value *Offset, KnownNonNull_t IsKnownNonNull = NotKnownNonNull) + CGPointerAuthInfo PtrAuthInfo, llvm::Value *Offset, + KnownNonNull_t IsKnownNonNull = NotKnownNonNull) : Pointer(BasePtr, IsKnownNonNull), ElementType(ElementType), - Alignment(Alignment), Offset(Offset) {} + Alignment(Alignment), PtrAuthInfo(PtrAuthInfo), Offset(Offset) {} Address(RawAddress RawAddr) - : Pointer(RawAddr.isValid() ? RawAddr.getPointer() : nullptr), + : Pointer(RawAddr.isValid() ? RawAddr.getPointer() : nullptr, + RawAddr.isValid() ? RawAddr.isKnownNonNull() : NotKnownNonNull), ElementType(RawAddr.isValid() ? RawAddr.getElementType() : nullptr), Alignment(RawAddr.isValid() ? RawAddr.getAlignment() : CharUnits::Zero()) {} @@ -192,6 +215,9 @@ class Address { /// Return the IR name of the pointer value. 
llvm::StringRef getName() const { return Pointer.getPointer()->getName(); } + const CGPointerAuthInfo &getPointerAuthInfo() const { return PtrAuthInfo; } + void setPointerAuthInfo(const CGPointerAuthInfo &Info) { PtrAuthInfo = Info; } + // This function is called only in CGBuilderBaseTy::CreateElementBitCast. void setElementType(llvm::Type *Ty) { assert(hasOffset() && @@ -199,6 +225,8 @@ class Address { ElementType = Ty; } + bool isSigned() const { return PtrAuthInfo.isSigned(); } + /// Whether the pointer is known not to be null. KnownNonNull_t isKnownNonNull() const { assert(isValid()); @@ -215,6 +243,9 @@ class Address { llvm::Value *getOffset() const { return Offset; } + Address getResignedAddress(const CGPointerAuthInfo &NewInfo, + CodeGenFunction &CGF) const; + /// Return the pointer contained in this class after authenticating it and /// adding offset to it if necessary. llvm::Value *emitRawPointer(CodeGenFunction &CGF) const { @@ -240,7 +271,8 @@ class Address { /// alignment. Address withElementType(llvm::Type *ElemTy) const { if (!hasOffset()) - return Address(getBasePointer(), ElemTy, getAlignment(), nullptr, + return Address(getBasePointer(), ElemTy, getAlignment(), + getPointerAuthInfo(), /*Offset=*/nullptr, isKnownNonNull()); Address A(*this); A.ElementType = ElemTy; diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h index 0bc4fda62979c..5d59d5a4ae2c1 100644 --- a/clang/lib/CodeGen/CGBuilder.h +++ b/clang/lib/CodeGen/CGBuilder.h @@ -190,8 +190,8 @@ class CGBuilderTy : public CGBuilderBaseTy { const llvm::Twine &Name = "") { if (!Addr.hasOffset()) return Address(CreateAddrSpaceCast(Addr.getBasePointer(), Ty, Name), - ElementTy, Addr.getAlignment(), nullptr, - Addr.isKnownNonNull()); + ElementTy, Addr.getAlignment(), Addr.getPointerAuthInfo(), + /*Offset=*/nullptr, Addr.isKnownNonNull()); // Eagerly force a raw address if these is an offset. 
return RawAddress( CreateAddrSpaceCast(Addr.emitRawPointer(*getCGF()), Ty, Name), diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 5fdd3cc490e59..6a0af00b9e186 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1312,7 +1312,8 @@ static Address EmitPointerWithAlignment(const Expr *E, LValueBaseInfo *BaseInfo, if (CE->getCastKind() == CK_AddressSpaceConversion) Addr = CGF.Builder.CreateAddrSpaceCast( Addr, CGF.ConvertType(E->getType()), ElemTy); - return Addr; + return CGF.authPointerToPointerCast(Addr, CE->getSubExpr()->getType(), + CE->getType()); } break; diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 084dc54537eb7..a17d68424bbce 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2374,7 +2374,9 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { DestLV.setTBAAInfo(TBAAAccessInfo::getMayAliasInfo()); return EmitLoadOfLValue(DestLV, CE->getExprLoc()); } - return Builder.CreateBitCast(Src, DstTy); + + llvm::Value *Result = Builder.CreateBitCast(Src, DstTy); + return CGF.authPointerToPointerCast(Result, E->getType(), DestTy); } case CK_AddressSpaceConversion: { Expr::EvalResult Result; @@ -2524,6 +2526,8 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { if (DestTy.mayBeDynamicClass()) IntToPtr = Builder.CreateLaunderInvariantGroup(IntToPtr); } + + IntToPtr = CGF.authPointerToPointerCast(IntToPtr, E->getType(), DestTy); return IntToPtr; } case CK_PointerToIntegral: { @@ -2539,6 +2543,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { PtrExpr = Builder.CreateStripInvariantGroup(PtrExpr); } + PtrExpr = CGF.authPointerToPointerCast(PtrExpr, E->getType(), DestTy); return Builder.CreatePtrToInt(PtrExpr, ConvertType(DestTy)); } case CK_ToVoid: { diff --git a/clang/lib/CodeGen/CGPointerAuth.cpp b/clang/lib/CodeGen/CGPointerAuth.cpp index 621d567dde721..7fe62c0788742 100644 --- 
a/clang/lib/CodeGen/CGPointerAuth.cpp +++ b/clang/lib/CodeGen/CGPointerAuth.cpp @@ -15,6 +15,7 @@ #include "CodeGenModule.h" #include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/SipHash.h" using namespace clang; @@ -165,6 +166,128 @@ CGPointerAuthInfo CodeGenModule::getPointerAuthInfoForType(QualType T) { return ::getPointerAuthInfoForType(*this, T); } +static bool isZeroConstant(const llvm::Value *Value) { + if (const auto *CI = dyn_cast(Value)) + return CI->isZero(); + return false; +} + +static bool equalAuthPolicies(const CGPointerAuthInfo &Left, + const CGPointerAuthInfo &Right) { + assert((Left.isSigned() || Right.isSigned()) && + "shouldn't be called if neither is signed"); + if (Left.isSigned() != Right.isSigned()) + return false; + return Left.getKey() == Right.getKey() && + Left.getAuthenticationMode() == Right.getAuthenticationMode(); +} + +// Return the discriminator or return zero if the discriminator is null. +static llvm::Value *getDiscriminatorOrZero(const CGPointerAuthInfo &Info, + CGBuilderTy &Builder) { + llvm::Value *Discriminator = Info.getDiscriminator(); + return Discriminator ? Discriminator : Builder.getSize(0); +} + +llvm::Value * +CodeGenFunction::emitPointerAuthResignCall(llvm::Value *Value, + const CGPointerAuthInfo &CurAuth, + const CGPointerAuthInfo &NewAuth) { + assert(CurAuth && NewAuth); + + if (CurAuth.getAuthenticationMode() != + PointerAuthenticationMode::SignAndAuth || + NewAuth.getAuthenticationMode() != + PointerAuthenticationMode::SignAndAuth) { + llvm::Value *AuthedValue = EmitPointerAuthAuth(CurAuth, Value); + return EmitPointerAuthSign(NewAuth, AuthedValue); + } + // Convert the pointer to intptr_t before signing it. 
+ auto *OrigType = Value->getType(); + Value = Builder.CreatePtrToInt(Value, IntPtrTy); + + auto *CurKey = Builder.getInt32(CurAuth.getKey()); + auto *NewKey = Builder.getInt32(NewAuth.getKey()); + + llvm::Value *CurDiscriminator = getDiscriminatorOrZero(CurAuth, Builder); + llvm::Value *NewDiscriminator = getDiscriminatorOrZero(NewAuth, Builder); + + // call i64 @llvm.ptrauth.resign(i64 %pointer, + // i32 %curKey, i64 %curDiscriminator, + // i32 %newKey, i64 %newDiscriminator) + auto *Intrinsic = CGM.getIntrinsic(llvm::Intrinsic::ptrauth_resign); + Value = EmitRuntimeCall( + Intrinsic, {Value, CurKey, CurDiscriminator, NewKey, NewDiscriminator}); + + // Convert back to the original type. + Value = Builder.CreateIntToPtr(Value, OrigType); + return Value; +} + +llvm::Value *CodeGenFunction::emitPointerAuthResign( + llvm::Value *Value, QualType Type, const CGPointerAuthInfo &CurAuthInfo, + const CGPointerAuthInfo &NewAuthInfo, bool IsKnownNonNull) { + // Fast path: if neither schema wants a signature, we're done. + if (!CurAuthInfo && !NewAuthInfo) + return Value; + + llvm::Value *Null = nullptr; + // If the value is obviously null, we're done. + if (auto *PointerValue = dyn_cast(Value->getType())) { + Null = CGM.getNullPointer(PointerValue, Type); + } else { + assert(Value->getType()->isIntegerTy()); + Null = llvm::ConstantInt::get(IntPtrTy, 0); + } + if (Value == Null) + return Value; + + // If both schemas sign the same way, we're done. 
+ if (equalAuthPolicies(CurAuthInfo, NewAuthInfo)) { + const llvm::Value *CurD = CurAuthInfo.getDiscriminator(); + const llvm::Value *NewD = NewAuthInfo.getDiscriminator(); + if (CurD == NewD) + return Value; + + if ((CurD == nullptr && isZeroConstant(NewD)) || + (NewD == nullptr && isZeroConstant(CurD))) + return Value; + } + + llvm::BasicBlock *InitBB = Builder.GetInsertBlock(); + llvm::BasicBlock *ResignBB = nullptr, *ContBB = nullptr; + + // Null pointers have to be mapped to null, and the ptrauth_resign + // intrinsic doesn't do that. + if (!IsKnownNonNull && !llvm::isKnownNonZero(Value, CGM.getDataLayout())) { + ContBB = createBasicBlock("resign.cont"); + ResignBB = createBasicBlock("resign.nonnull"); + + auto *IsNonNull = Builder.CreateICmpNE(Value, Null); + Builder.CreateCondBr(IsNonNull, ResignBB, ContBB); + EmitBlock(ResignBB); + } + + // Perform the auth/sign/resign operation. + if (!NewAuthInfo) + Value = EmitPointerAuthAuth(CurAuthInfo, Value); + else if (!CurAuthInfo) + Value = EmitPointerAuthSign(NewAuthInfo, Value); + else + Value = emitPointerAuthResignCall(Value, CurAuthInfo, NewAuthInfo); + + // Clean up with a phi if we branched before. 
+ if (ContBB) { + EmitBlock(ContBB); + auto *Phi = Builder.CreatePHI(Value->getType(), 2); + Phi->addIncoming(Null, InitBB); + Phi->addIncoming(Value, ResignBB); + Value = Phi; + } + + return Value; +} + llvm::Constant * CodeGenModule::getConstantSignedPointer(llvm::Constant *Pointer, unsigned Key, llvm::Constant *StorageAddress, @@ -351,3 +474,110 @@ CodeGenModule::getVTablePointerAuthInfo(CodeGenFunction *CGF, /* IsIsaPointer */ false, /* AuthenticatesNullValues */ false, Discriminator); } + +llvm::Value *CodeGenFunction::authPointerToPointerCast(llvm::Value *ResultPtr, + QualType SourceType, + QualType DestType) { + CGPointerAuthInfo CurAuthInfo, NewAuthInfo; + if (SourceType->isSignableType()) + CurAuthInfo = getPointerAuthInfoForType(CGM, SourceType); + + if (DestType->isSignableType()) + NewAuthInfo = getPointerAuthInfoForType(CGM, DestType); + + if (!CurAuthInfo && !NewAuthInfo) + return ResultPtr; + + // If only one side of the cast is a function pointer, then we still need to + // resign to handle casts to/from opaque pointers. + if (!CurAuthInfo && DestType->isFunctionPointerType()) + CurAuthInfo = CGM.getFunctionPointerAuthInfo(SourceType); + + if (!NewAuthInfo && SourceType->isFunctionPointerType()) + NewAuthInfo = CGM.getFunctionPointerAuthInfo(DestType); + + return emitPointerAuthResign(ResultPtr, DestType, CurAuthInfo, NewAuthInfo, + /*IsKnownNonNull=*/false); +} + +Address CodeGenFunction::authPointerToPointerCast(Address Ptr, + QualType SourceType, + QualType DestType) { + CGPointerAuthInfo CurAuthInfo, NewAuthInfo; + if (SourceType->isSignableType()) + CurAuthInfo = getPointerAuthInfoForType(CGM, SourceType); + + if (DestType->isSignableType()) + NewAuthInfo = getPointerAuthInfoForType(CGM, DestType); + + if (!CurAuthInfo && !NewAuthInfo) + return Ptr; + + if (!CurAuthInfo && DestType->isFunctionPointerType()) { + // When casting a non-signed pointer to a function pointer, just set the + // auth info on Ptr to the assumed schema. 
The pointer will be resigned to + // the effective type when used. + Ptr.setPointerAuthInfo(CGM.getFunctionPointerAuthInfo(SourceType)); + return Ptr; + } + + if (!NewAuthInfo && SourceType->isFunctionPointerType()) { + NewAuthInfo = CGM.getFunctionPointerAuthInfo(DestType); + Ptr = Ptr.getResignedAddress(NewAuthInfo, *this); + Ptr.setPointerAuthInfo(CGPointerAuthInfo()); + return Ptr; + } + + return Ptr; +} + +Address CodeGenFunction::getAsNaturalAddressOf(Address Addr, + QualType PointeeTy) { + CGPointerAuthInfo Info = + PointeeTy.isNull() ? CGPointerAuthInfo() + : CGM.getPointerAuthInfoForPointeeType(PointeeTy); + return Addr.getResignedAddress(Info, *this); +} + +Address Address::getResignedAddress(const CGPointerAuthInfo &NewInfo, + CodeGenFunction &CGF) const { + assert(isValid() && "pointer isn't valid"); + CGPointerAuthInfo CurInfo = getPointerAuthInfo(); + llvm::Value *Val; + + // Nothing to do if neither the current or the new ptrauth info needs signing. + if (!CurInfo.isSigned() && !NewInfo.isSigned()) + return Address(getBasePointer(), getElementType(), getAlignment(), + isKnownNonNull()); + + assert(ElementType && "Effective type has to be set"); + assert(!Offset && "unexpected non-null offset"); + + // If the current and the new ptrauth infos are the same and the offset is + // null, just cast the base pointer to the effective type. 
+ if (CurInfo == NewInfo && !hasOffset()) + Val = getBasePointer(); + else + Val = CGF.emitPointerAuthResign(getBasePointer(), QualType(), CurInfo, + NewInfo, isKnownNonNull()); + + Val = CGF.Builder.CreateBitCast(Val, getType()); + return Address(Val, getElementType(), getAlignment(), NewInfo, + /*Offset=*/nullptr, isKnownNonNull()); +} + +llvm::Value *LValue::getPointer(CodeGenFunction &CGF) const { + assert(isSimple()); + return emitResignedPointer(getType(), CGF); +} + +llvm::Value *LValue::emitResignedPointer(QualType PointeeTy, + CodeGenFunction &CGF) const { + assert(isSimple()); + return CGF.getAsNaturalAddressOf(Addr, PointeeTy).getBasePointer(); +} + +llvm::Value *LValue::emitRawPointer(CodeGenFunction &CGF) const { + assert(isSimple()); + return Addr.isValid() ? Addr.emitRawPointer(CGF) : nullptr; +} diff --git a/clang/lib/CodeGen/CGValue.h b/clang/lib/CodeGen/CGValue.h index f1ba3cf95ae59..c4ec8d207d2e3 100644 --- a/clang/lib/CodeGen/CGValue.h +++ b/clang/lib/CodeGen/CGValue.h @@ -15,6 +15,7 @@ #define LLVM_CLANG_LIB_CODEGEN_CGVALUE_H #include "Address.h" +#include "CGPointerAuthInfo.h" #include "CodeGenTBAA.h" #include "EHScopeStack.h" #include "clang/AST/ASTContext.h" @@ -233,9 +234,6 @@ class LValue { // this lvalue. bool Nontemporal : 1; - // The pointer is known not to be null. 
- bool IsKnownNonNull : 1; - LValueBaseInfo BaseInfo; TBAAAccessInfo TBAAInfo; @@ -263,7 +261,6 @@ class LValue { this->ImpreciseLifetime = false; this->Nontemporal = false; this->ThreadLocalRef = false; - this->IsKnownNonNull = false; this->BaseIvarExp = nullptr; } @@ -349,28 +346,26 @@ class LValue { LValueBaseInfo getBaseInfo() const { return BaseInfo; } void setBaseInfo(LValueBaseInfo Info) { BaseInfo = Info; } - KnownNonNull_t isKnownNonNull() const { - return (KnownNonNull_t)IsKnownNonNull; - } + KnownNonNull_t isKnownNonNull() const { return Addr.isKnownNonNull(); } LValue setKnownNonNull() { - IsKnownNonNull = true; + Addr.setKnownNonNull(); return *this; } // simple lvalue - llvm::Value *getPointer(CodeGenFunction &CGF) const { - assert(isSimple()); - return Addr.getBasePointer(); - } - llvm::Value *emitRawPointer(CodeGenFunction &CGF) const { - assert(isSimple()); - return Addr.isValid() ? Addr.emitRawPointer(CGF) : nullptr; - } + llvm::Value *getPointer(CodeGenFunction &CGF) const; + llvm::Value *emitResignedPointer(QualType PointeeTy, + CodeGenFunction &CGF) const; + llvm::Value *emitRawPointer(CodeGenFunction &CGF) const; Address getAddress() const { return Addr; } void setAddress(Address address) { Addr = address; } + CGPointerAuthInfo getPointerAuthInfo() const { + return Addr.getPointerAuthInfo(); + } + // vector elt lvalue Address getVectorAddress() const { assert(isVectorElt()); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index ea4635c039cb2..551db09165dbe 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -195,34 +195,46 @@ CodeGenFunction::CGFPOptionsRAII::~CGFPOptionsRAII() { CGF.Builder.setDefaultConstrainedRounding(OldRounding); } -static LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T, - bool ForPointeeType, - CodeGenFunction &CGF) { +static LValue +makeNaturalAlignAddrLValue(llvm::Value *V, QualType T, bool ForPointeeType, + bool 
MightBeSigned, CodeGenFunction &CGF, + KnownNonNull_t IsKnownNonNull = NotKnownNonNull) { LValueBaseInfo BaseInfo; TBAAAccessInfo TBAAInfo; CharUnits Alignment = CGF.CGM.getNaturalTypeAlignment(T, &BaseInfo, &TBAAInfo, ForPointeeType); - Address Addr = Address(V, CGF.ConvertTypeForMem(T), Alignment); + Address Addr = + MightBeSigned + ? CGF.makeNaturalAddressForPointer(V, T, Alignment, false, nullptr, + nullptr, IsKnownNonNull) + : Address(V, CGF.ConvertTypeForMem(T), Alignment, IsKnownNonNull); return CGF.MakeAddrLValue(Addr, T, BaseInfo, TBAAInfo); } -LValue CodeGenFunction::MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T) { - return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false, *this); +LValue +CodeGenFunction::MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T, + KnownNonNull_t IsKnownNonNull) { + return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false, + /*MightBeSigned*/ true, *this, + IsKnownNonNull); } LValue CodeGenFunction::MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T) { - return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true, *this); + return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true, + /*MightBeSigned*/ true, *this); } LValue CodeGenFunction::MakeNaturalAlignRawAddrLValue(llvm::Value *V, QualType T) { - return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false, *this); + return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ false, + /*MightBeSigned*/ false, *this); } LValue CodeGenFunction::MakeNaturalAlignPointeeRawAddrLValue(llvm::Value *V, QualType T) { - return ::MakeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true, *this); + return ::makeNaturalAlignAddrLValue(V, T, /*ForPointeeType*/ true, + /*MightBeSigned*/ false, *this); } llvm::Type *CodeGenFunction::ConvertTypeForMem(QualType T) { diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 9fe4391237819..d83e38cab8e2d 100644 --- 
a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -202,7 +202,7 @@ template <> struct DominatingValue
{ } static type restore(CodeGenFunction &CGF, saved_type value) { return Address(DominatingLLVMValue::restore(CGF, value.BasePtr), - value.ElementType, value.Alignment, + value.ElementType, value.Alignment, CGPointerAuthInfo(), DominatingLLVMValue::restore(CGF, value.Offset)); } }; @@ -2689,7 +2689,8 @@ class CodeGenFunction : public CodeGenTypeCache { if (Alignment.isZero()) Alignment = CGM.getNaturalTypeAlignment(T, BaseInfo, TBAAInfo, ForPointeeType); - return Address(Ptr, ConvertTypeForMem(T), Alignment, nullptr, + return Address(Ptr, ConvertTypeForMem(T), Alignment, + CGM.getPointerAuthInfoForPointeeType(T), /*Offset=*/nullptr, IsKnownNonNull); } @@ -2730,7 +2731,9 @@ class CodeGenFunction : public CodeGenTypeCache { /// an l-value with the natural pointee alignment of T. LValue MakeNaturalAlignPointeeAddrLValue(llvm::Value *V, QualType T); - LValue MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T); + LValue + MakeNaturalAlignAddrLValue(llvm::Value *V, QualType T, + KnownNonNull_t IsKnownNonNull = NotKnownNonNull); /// Same as MakeNaturalAlignPointeeAddrLValue except that the pointer is known /// to be unsigned. @@ -4424,10 +4427,6 @@ class CodeGenFunction : public CodeGenTypeCache { CXXDtorType Type, const CXXRecordDecl *RD); - llvm::Value *getAsNaturalPointerTo(Address Addr, QualType PointeeType) { - return Addr.getBasePointer(); - } - bool isPointerKnownNonNull(const Expr *E); /// Create the discriminator from the storage address and the entity hash. 
@@ -4437,16 +4436,36 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *StorageAddress, GlobalDecl SchemaDecl, QualType SchemaType); - llvm::Value *EmitPointerAuthSign(QualType PointeeType, llvm::Value *Pointer); + llvm::Value *EmitPointerAuthSign(const CGPointerAuthInfo &Info, llvm::Value *Pointer); + llvm::Value *EmitPointerAuthAuth(const CGPointerAuthInfo &Info, llvm::Value *Pointer); + llvm::Value *emitPointerAuthResign(llvm::Value *Pointer, QualType PointerType, + const CGPointerAuthInfo &CurAuthInfo, + const CGPointerAuthInfo &NewAuthInfo, + bool IsKnownNonNull); + llvm::Value *emitPointerAuthResignCall(llvm::Value *Pointer, + const CGPointerAuthInfo &CurInfo, + const CGPointerAuthInfo &NewInfo); + void EmitPointerAuthOperandBundle( const CGPointerAuthInfo &Info, SmallVectorImpl &Bundles); + llvm::Value *authPointerToPointerCast(llvm::Value *ResultPtr, + QualType SourceType, QualType DestType); + Address authPointerToPointerCast(Address Ptr, QualType SourceType, + QualType DestType); + + Address getAsNaturalAddressOf(Address Addr, QualType PointeeTy); + + llvm::Value *getAsNaturalPointerTo(Address Addr, QualType PointeeType) { + return getAsNaturalAddressOf(Addr, PointeeType).getBasePointer(); + } + // Return the copy constructor name with the prefix "__copy_constructor_" // removed. static std::string getNonTrivialCopyConstructorStr(QualType QT, diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h index 40ac6dcac2ab8..e0bc8c4f9acf7 100644 --- a/clang/lib/Headers/ptrauth.h +++ b/clang/lib/Headers/ptrauth.h @@ -58,6 +58,21 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t; /* Authenticating a pointer that was not signed with the given key and extra-data value will (likely) fail by trapping. */ +/* The null function pointer is always the all-zero bit pattern. + Signing an all-zero bit pattern will embed a (likely) non-zero + signature in the result, and so the result will not seem to be + a null function pointer. 
Authenticating this value will yield
+   a null function pointer back. However, authenticating an
+   all-zero bit pattern will probably fail, because the
+   authentication will expect a (likely) non-zero signature to
+   be embedded in the value.
+
+   Because of this, if a pointer may validly be null, you should
+   check for null before attempting to authenticate it with one
+   of these intrinsics. This is not necessary when using the
+   __ptrauth qualifier; the compiler will perform this check
+   automatically. */
+
 #if __has_feature(ptrauth_intrinsics)
 
 /* Strip the signature from a value without authenticating it.
diff --git a/clang/test/CodeGen/ptrauth-function-lvalue-cast-disc.c b/clang/test/CodeGen/ptrauth-function-lvalue-cast-disc.c
new file mode 100644
index 0000000000000..7d76649e2e49c
--- /dev/null
+++ b/clang/test/CodeGen/ptrauth-function-lvalue-cast-disc.c
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 %s -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -emit-llvm -o- -fptrauth-function-pointer-type-discrimination | FileCheck -check-prefixes CHECK,TYPE %s
+// RUN: %clang_cc1 %s -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -emit-llvm -o- | FileCheck -check-prefixes CHECK,ZERO %s
+
+typedef void (*fptr_t)(void);
+
+char *cptr;
+void (*fptr)(void);
+
+// CHECK-LABEL: define void @test1
+void test1() {
+  // TYPE: [[LOAD:%.*]] = load ptr, ptr @cptr
+  // TYPE: [[TOINT:%.*]] = ptrtoint ptr [[LOAD]] to i64
+  // TYPE: call i64 @llvm.ptrauth.resign(i64 [[TOINT]], i32 0, i64 0, i32 0, i64 18983)
+  // TYPE: call void {{.*}}() [ "ptrauth"(i32 0, i64 18983) ]
+  // ZERO-NOT: @llvm.ptrauth.resign
+
+  (*(fptr_t)cptr)();
+}
+
+// CHECK-LABEL: define i8 @test2
+char test2() {
+  return *(char *)fptr;
+
+  // TYPE: [[LOAD:%.*]] = load ptr, ptr @fptr
+  // TYPE: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null
+  // TYPE-NEXT: br i1 [[CMP]], label %[[NONNULL:.*]], label %[[CONT:.*]]
+
+  // TYPE: [[NONNULL]]:
+  // TYPE: [[TOINT:%.*]] = ptrtoint ptr [[LOAD]] to i64
+  // TYPE:
[[CALL:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[TOINT]], i32 0, i64 18983, i32 0, i64 0) + // TYPE: [[TOPTR:%.*]] = inttoptr i64 [[CALL]] to ptr + + // TYPE: [[CONT]]: + // TYPE: phi ptr [ null, {{.*}} ], [ [[TOPTR]], %[[NONNULL]] ] + // ZERO-NOT: @llvm.ptrauth.resign +} + +// CHECK-LABEL: define void @test4 +void test4() { + (*((fptr_t)(&*((char *)(&*(fptr_t)cptr)))))(); + + // CHECK: [[LOAD:%.*]] = load ptr, ptr @cptr + // TYPE-NEXT: [[CAST4:%.*]] = ptrtoint ptr [[LOAD]] to i64 + // TYPE-NEXT: [[RESIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[CAST4]], i32 0, i64 0, i32 0, i64 18983) + // TYPE-NEXT: [[CAST5:%.*]] = inttoptr i64 [[RESIGN]] to ptr + // TYPE-NEXT: call void [[CAST5]]() [ "ptrauth"(i32 0, i64 18983) ] + // ZERO-NOT: @llvm.ptrauth.resign + // ZERO: call void [[LOAD]]() [ "ptrauth"(i32 0, i64 0) ] +} + +void *vptr; +// CHECK-LABEL: define void @test5 +void test5() { + vptr = &*(char *)fptr; + + // TYPE: [[LOAD:%.*]] = load ptr, ptr @fptr + // TYPE-NEXT: [[CMP]] = icmp ne ptr [[LOAD]], null + // TYPE-NEXT: br i1 [[CMP]], label %[[NONNULL:.*]], label %[[CONT:.*]] + + // TYPE: [[NONNULL]]: + // TYPE: [[RESIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 {{.*}}, i32 0, i64 18983, i32 0, i64 0) + // TYPE: [[CAST:%.*]] = inttoptr i64 [[RESIGN]] to ptr + + // TYPE: [[CONT]]: + // TYPE: [[PHI:%.*]] = phi ptr [ null, {{.*}} ], [ [[CAST]], %[[NONNULL]] ] + // TYPE: store ptr [[PHI]], ptr @vptr + // ZERO-NOT: @llvm.ptrauth.resign +} diff --git a/clang/test/CodeGen/ptrauth-function-type-discriminator-cast.c b/clang/test/CodeGen/ptrauth-function-type-discriminator-cast.c new file mode 100644 index 0000000000000..cdf9ee4907525 --- /dev/null +++ b/clang/test/CodeGen/ptrauth-function-type-discriminator-cast.c @@ -0,0 +1,94 @@ +// RUN: %clang_cc1 %s -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck %s --check-prefixes=CHECK,TYPE +// RUN: %clang_cc1 %s -triple 
arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck %s --check-prefixes=CHECK,ZERO +// RUN: %clang_cc1 -xc++ %s -fptrauth-function-pointer-type-discrimination -triple arm64e-apple-ios13 -fptrauth-calls -fptrauth-intrinsics -disable-llvm-passes -emit-llvm -o- | FileCheck %s --check-prefixes=CHECK,CHECKCXX,TYPE,TYPECXX + +#ifdef __cplusplus +extern "C" { +#endif + +void f(void); +void f2(int); +void (*fptr)(void); +void *opaque; +unsigned long uintptr; + +#ifdef __cplusplus +struct ptr_member { + void (*fptr_)(int) = 0; +}; +ptr_member pm; +void (*test_member)() = (void (*)())pm.fptr_; + +// CHECKCXX-LABEL: define internal void @__cxx_global_var_init +// TYPECXX: call i64 @llvm.ptrauth.resign(i64 {{.*}}, i32 0, i64 2712, i32 0, i64 18983) +#endif + + +// CHECK-LABEL: define void @test_cast_to_opaque +void test_cast_to_opaque() { + opaque = (void *)f; + + // TYPE: [[RESIGN_VAL:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @f, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 0, i64 0) + // TYPE: [[RESIGN_PTR:%.*]] = inttoptr i64 [[RESIGN_VAL]] to ptr + // ZERO-NOT: @llvm.ptrauth.resign +} + +// CHECK-LABEL: define void @test_cast_from_opaque +void test_cast_from_opaque() { + fptr = (void (*)(void))opaque; + + // TYPE: [[LOAD:%.*]] = load ptr, ptr @opaque + // TYPE: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null + // TYPE: br i1 [[CMP]], label %[[RESIGN_LAB:.*]], label + + // TYPE: [[RESIGN_LAB]]: + // TYPE: [[INT:%.*]] = ptrtoint ptr [[LOAD]] to i64 + // TYPE: [[RESIGN_INT:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[INT]], i32 0, i64 0, i32 0, i64 18983) + + // ZERO-NOT: @llvm.ptrauth.resign +} + +// CHECK-LABEL: define void @test_cast_to_intptr +void test_cast_to_intptr() { + uintptr = (unsigned long)fptr; + + // TYPE: [[ENTRY:.*]]: + // TYPE: [[LOAD:%.*]] = load ptr, ptr @fptr + // TYPE: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null + // TYPE: br i1 [[CMP]], label %[[RESIGN_LAB:.*]], label 
%[[RESIGN_CONT:.*]] + + // TYPE: [[RESIGN_LAB]]: + // TYPE: [[INT:%.*]] = ptrtoint ptr [[LOAD]] to i64 + // TYPE: [[RESIGN_INT:%.*]] = call i64 @llvm.ptrauth.resign(i64 [[INT]], i32 0, i64 18983, i32 0, i64 0) + // TYPE: [[RESIGN:%.*]] = inttoptr i64 [[RESIGN_INT]] to ptr + // TYPE: br label %[[RESIGN_CONT]] + + // TYPE: [[RESIGN_CONT]]: + // TYPE: phi ptr [ null, %[[ENTRY]] ], [ [[RESIGN]], %[[RESIGN_LAB]] ] + + // ZERO-NOT: @llvm.ptrauth.resign +} + +// CHECK-LABEL: define void @test_function_to_function_cast +void test_function_to_function_cast() { + void (*fptr2)(int) = (void (*)(int))fptr; + // TYPE: call i64 @llvm.ptrauth.resign(i64 {{.*}}, i32 0, i64 18983, i32 0, i64 2712) + // ZERO-NOT: @llvm.ptrauth.resign +} + +// CHECK-LABEL: define void @test_call_lvalue_cast +void test_call_lvalue_cast() { + (*(void (*)(int))f)(42); + + // TYPE: entry: + // TYPE-NEXT: [[RESIGN:%.*]] = call i64 @llvm.ptrauth.resign(i64 ptrtoint (ptr ptrauth (ptr @f, i32 0, i64 18983) to i64), i32 0, i64 18983, i32 0, i64 2712) + // TYPE-NEXT: [[RESIGN_INT:%.*]] = inttoptr i64 [[RESIGN]] to ptr + // TYPE-NEXT: call void [[RESIGN_INT]](i32 noundef 42) [ "ptrauth"(i32 0, i64 2712) ] + // ZERO-NOT: @llvm.ptrauth.resign + // ZERO: call void ptrauth (ptr @f, i32 0)(i32 noundef 42) [ "ptrauth"(i32 0, i64 0) ] +} + + +#ifdef __cplusplus +} +#endif From 676efd0ffb717215c752f200fe14163732290dcc Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 18 Jul 2024 15:22:07 +0100 Subject: [PATCH 033/486] Reapply 078198f310d5 "Index DebugVariables and some DILocations" Now revised to actually make the unit test compile, which I'd been ignoring. No actual functional change, it's a type difference. Original commit message follows. [DebugInfo][InstrRef] Index DebugVariables and some DILocations (#99318) A lot of time in LiveDebugValues is spent computing DenseMap keys for DebugVariables, and they're made up of three pointers, so are large. 
This patch installs an index for them: for the SSA and value-to-location mapping parts of InstrRefBasedLDV we don't need to access things like the variable declaration or the inlining site, so just use a uint32_t identifier for each variable fragment that's tracked. The compile-time performance improvements are substantial (almost 0.4% on the tracker). About 80% of this patch is just replacing DebugVariable references with DebugVariableIDs instead, however there are some larger consequences. We spend lots of time fetching DILocations when emitting DBG_VALUE instructions, so index those with the DebugVariables: this means all DILocations on all new DBG_VALUE instructions will normalise to the first-seen DILocation for the variable (which should be fine). We also used to keep an ordering of when each variable was seen first in a DBG_* instruction, in the AllVarsNumbering collection, so that we can emit new DBG_* instructions in a stable order. We can hang this off the DebugVariable index instead, so AllVarsNumbering is deleted. Finally, rather than ordering by AllVarsNumbering just before DBG_* instructions are linked into the output MIR, store instructions along with their DebugVariableID, so that they can be sorted by that instead. --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 201 +++++++++--------- .../LiveDebugValues/InstrRefBasedImpl.h | 111 +++++++--- .../MIR/X86/live-debug-values-fragments.mir | 4 +- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 129 +++++------ 4 files changed, 256 insertions(+), 189 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 247258a1ff553..b9cf36a07846c 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -183,6 +183,7 @@ class TransferTracker { /// information from it. (XXX make it const?) 
MLocTracker *MTracker; MachineFunction &MF; + const DebugVariableMap &DVMap; bool ShouldEmitDebugEntryValues; /// Record of all changes in variable locations at a block position. Awkwardly @@ -191,7 +192,9 @@ class TransferTracker { struct Transfer { MachineBasicBlock::instr_iterator Pos; /// Position to insert DBG_VALUes MachineBasicBlock *MBB; /// non-null if we should insert after. - SmallVector Insts; /// Vector of DBG_VALUEs to insert. + /// Vector of DBG_VALUEs to insert. Store with their DebugVariableID so that + /// they can be sorted into a stable order for emission at a later time. + SmallVector, 4> Insts; }; /// Stores the resolved operands (machine locations and constants) and @@ -227,15 +230,15 @@ class TransferTracker { /// Map from LocIdxes to which DebugVariables are based that location. /// Mantained while stepping through the block. Not accurate if /// VarLocs[Idx] != MTracker->LocIdxToIDNum[Idx]. - DenseMap> ActiveMLocs; + DenseMap> ActiveMLocs; /// Map from DebugVariable to it's current location and qualifying meta /// information. To be used in conjunction with ActiveMLocs to construct /// enough information for the DBG_VALUEs for a particular LocIdx. - DenseMap ActiveVLocs; + DenseMap ActiveVLocs; /// Temporary cache of DBG_VALUEs to be entered into the Transfers collection. - SmallVector PendingDbgValues; + SmallVector, 4> PendingDbgValues; /// Record of a use-before-def: created when a value that's live-in to the /// current block isn't available in any machine location, but it will be @@ -244,12 +247,12 @@ class TransferTracker { /// Value of this variable, def'd in block. SmallVector Values; /// Identity of this variable. - DebugVariable Var; + DebugVariableID VarID; /// Additional variable properties. 
DbgValueProperties Properties; - UseBeforeDef(ArrayRef Values, const DebugVariable &Var, + UseBeforeDef(ArrayRef Values, DebugVariableID VarID, const DbgValueProperties &Properties) - : Values(Values.begin(), Values.end()), Var(Var), + : Values(Values.begin(), Values.end()), VarID(VarID), Properties(Properties) {} }; @@ -260,15 +263,16 @@ class TransferTracker { /// The set of variables that are in UseBeforeDefs and can become a location /// once the relevant value is defined. An element being erased from this /// collection prevents the use-before-def materializing. - DenseSet UseBeforeDefVariables; + DenseSet UseBeforeDefVariables; const TargetRegisterInfo &TRI; const BitVector &CalleeSavedRegs; TransferTracker(const TargetInstrInfo *TII, MLocTracker *MTracker, - MachineFunction &MF, const TargetRegisterInfo &TRI, + MachineFunction &MF, const DebugVariableMap &DVMap, + const TargetRegisterInfo &TRI, const BitVector &CalleeSavedRegs, const TargetPassConfig &TPC) - : TII(TII), MTracker(MTracker), MF(MF), TRI(TRI), + : TII(TII), MTracker(MTracker), MF(MF), DVMap(DVMap), TRI(TRI), CalleeSavedRegs(CalleeSavedRegs) { TLI = MF.getSubtarget().getTargetLowering(); auto &TM = TPC.getTM(); @@ -352,7 +356,7 @@ class TransferTracker { /// determine the values used by Value. void loadVarInloc(MachineBasicBlock &MBB, DbgOpIDMap &DbgOpStore, const SmallVectorImpl &ValueToLoc, - DebugVariable Var, DbgValue Value) { + DebugVariableID VarID, DbgValue Value) { SmallVector DbgOps; SmallVector ResolvedDbgOps; bool IsValueValid = true; @@ -401,7 +405,7 @@ class TransferTracker { static_cast(Num.getInst())); continue; } - recoverAsEntryValue(Var, Value.Properties, Num); + recoverAsEntryValue(VarID, Value.Properties, Num); IsValueValid = false; break; } @@ -419,8 +423,7 @@ class TransferTracker { // Add UseBeforeDef entry for the last value to be defined in this block. 
if (LastUseBeforeDef) { - addUseBeforeDef(Var, Value.Properties, DbgOps, - LastUseBeforeDef); + addUseBeforeDef(VarID, Value.Properties, DbgOps, LastUseBeforeDef); return; } @@ -428,13 +431,15 @@ class TransferTracker { // the transfer. for (const ResolvedDbgOp &Op : ResolvedDbgOps) if (!Op.IsConst) - ActiveMLocs[Op.Loc].insert(Var); + ActiveMLocs[Op.Loc].insert(VarID); auto NewValue = ResolvedDbgValue{ResolvedDbgOps, Value.Properties}; - auto Result = ActiveVLocs.insert(std::make_pair(Var, NewValue)); + auto Result = ActiveVLocs.insert(std::make_pair(VarID, NewValue)); if (!Result.second) Result.first->second = NewValue; + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); PendingDbgValues.push_back( - MTracker->emitLoc(ResolvedDbgOps, Var, Value.Properties)); + std::make_pair(VarID, &*MTracker->emitLoc(ResolvedDbgOps, Var, DILoc, + Value.Properties))); } /// Load object with live-in variable values. \p mlocs contains the live-in @@ -445,7 +450,7 @@ class TransferTracker { /// FIXME: could just examine mloctracker instead of passing in \p mlocs? void loadInlocs(MachineBasicBlock &MBB, ValueTable &MLocs, DbgOpIDMap &DbgOpStore, - const SmallVectorImpl> &VLocs, + const SmallVectorImpl> &VLocs, unsigned NumLocs) { ActiveMLocs.clear(); ActiveVLocs.clear(); @@ -506,11 +511,11 @@ class TransferTracker { /// Record that \p Var has value \p ID, a value that becomes available /// later in the function. 
- void addUseBeforeDef(const DebugVariable &Var, + void addUseBeforeDef(DebugVariableID VarID, const DbgValueProperties &Properties, const SmallVectorImpl &DbgOps, unsigned Inst) { - UseBeforeDefs[Inst].emplace_back(DbgOps, Var, Properties); - UseBeforeDefVariables.insert(Var); + UseBeforeDefs[Inst].emplace_back(DbgOps, VarID, Properties); + UseBeforeDefVariables.insert(VarID); } /// After the instruction at index \p Inst and position \p pos has been @@ -529,7 +534,7 @@ class TransferTracker { // Populate ValueToLoc with illegal default mappings for every value used by // any UseBeforeDef variables for this instruction. for (auto &Use : MIt->second) { - if (!UseBeforeDefVariables.count(Use.Var)) + if (!UseBeforeDefVariables.count(Use.VarID)) continue; for (DbgOp &Op : Use.Values) { @@ -568,7 +573,7 @@ class TransferTracker { // Using the map of values to locations, produce a final set of values for // this variable. for (auto &Use : MIt->second) { - if (!UseBeforeDefVariables.count(Use.Var)) + if (!UseBeforeDefVariables.count(Use.VarID)) continue; SmallVector DbgOps; @@ -591,8 +596,9 @@ class TransferTracker { continue; // Otherwise, we're good to go. - PendingDbgValues.push_back( - MTracker->emitLoc(DbgOps, Use.Var, Use.Properties)); + auto &[Var, DILoc] = DVMap.lookupDVID(Use.VarID); + PendingDbgValues.push_back(std::make_pair( + Use.VarID, MTracker->emitLoc(DbgOps, Var, DILoc, Use.Properties))); } flushDbgValues(pos, nullptr); } @@ -642,7 +648,7 @@ class TransferTracker { return Reg != SP && Reg != FP; } - bool recoverAsEntryValue(const DebugVariable &Var, + bool recoverAsEntryValue(DebugVariableID VarID, const DbgValueProperties &Prop, const ValueIDNum &Num) { // Is this variable location a candidate to be an entry value. First, @@ -663,6 +669,8 @@ class TransferTracker { DIExpr = *NonVariadicExpression; } + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); + // Is the variable appropriate for entry values (i.e., is a parameter). 
if (!isEntryValueVariable(Var, DIExpr)) return false; @@ -676,9 +684,8 @@ class TransferTracker { DIExpression::prepend(DIExpr, DIExpression::EntryValue); Register Reg = MTracker->LocIdxToLocID[Num.getLoc()]; MachineOperand MO = MachineOperand::CreateReg(Reg, false); - - PendingDbgValues.push_back( - emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false})); + PendingDbgValues.push_back(std::make_pair( + VarID, &*emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false}))); return true; } @@ -687,19 +694,20 @@ class TransferTracker { DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); DbgValueProperties Properties(MI); + DebugVariableID VarID = DVMap.getDVID(Var); // Ignore non-register locations, we don't transfer those. if (MI.isUndefDebugValue() || all_of(MI.debug_operands(), [](const MachineOperand &MO) { return !MO.isReg(); })) { - auto It = ActiveVLocs.find(Var); + auto It = ActiveVLocs.find(VarID); if (It != ActiveVLocs.end()) { for (LocIdx Loc : It->second.loc_indices()) - ActiveMLocs[Loc].erase(Var); + ActiveMLocs[Loc].erase(VarID); ActiveVLocs.erase(It); } // Any use-before-defs no longer apply. - UseBeforeDefVariables.erase(Var); + UseBeforeDefVariables.erase(VarID); return; } @@ -725,14 +733,15 @@ class TransferTracker { SmallVectorImpl &NewLocs) { DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); + DebugVariableID VarID = DVMap.getDVID(Var); // Any use-before-defs no longer apply. - UseBeforeDefVariables.erase(Var); + UseBeforeDefVariables.erase(VarID); // Erase any previous location. - auto It = ActiveVLocs.find(Var); + auto It = ActiveVLocs.find(VarID); if (It != ActiveVLocs.end()) { for (LocIdx Loc : It->second.loc_indices()) - ActiveMLocs[Loc].erase(Var); + ActiveMLocs[Loc].erase(VarID); } // If there _is_ no new location, all we had to do was erase. 
@@ -742,7 +751,7 @@ class TransferTracker { return; } - SmallVector> LostMLocs; + SmallVector> LostMLocs; for (ResolvedDbgOp &Op : NewLocs) { if (Op.IsConst) continue; @@ -769,17 +778,17 @@ class TransferTracker { for (const auto &LostMLoc : LostMLocs) ActiveMLocs[LostMLoc.first].erase(LostMLoc.second); LostMLocs.clear(); - It = ActiveVLocs.find(Var); + It = ActiveVLocs.find(VarID); ActiveMLocs[NewLoc.asU64()].clear(); VarLocs[NewLoc.asU64()] = MTracker->readMLoc(NewLoc); } - ActiveMLocs[NewLoc].insert(Var); + ActiveMLocs[NewLoc].insert(VarID); } if (It == ActiveVLocs.end()) { ActiveVLocs.insert( - std::make_pair(Var, ResolvedDbgValue(NewLocs, Properties))); + std::make_pair(VarID, ResolvedDbgValue(NewLocs, Properties))); } else { It->second.Ops.assign(NewLocs); It->second.Properties = Properties; @@ -822,21 +831,21 @@ class TransferTracker { // explicitly undef, then stop here. if (!NewLoc && !MakeUndef) { // Try and recover a few more locations with entry values. - for (const auto &Var : ActiveMLocIt->second) { - auto &Prop = ActiveVLocs.find(Var)->second.Properties; - recoverAsEntryValue(Var, Prop, OldValue); + for (DebugVariableID VarID : ActiveMLocIt->second) { + auto &Prop = ActiveVLocs.find(VarID)->second.Properties; + recoverAsEntryValue(VarID, Prop, OldValue); } flushDbgValues(Pos, nullptr); return; } // Examine all the variables based on this location. - DenseSet NewMLocs; + DenseSet NewMLocs; // If no new location has been found, every variable that depends on this // MLoc is dead, so end their existing MLoc->Var mappings as well. - SmallVector> LostMLocs; - for (const auto &Var : ActiveMLocIt->second) { - auto ActiveVLocIt = ActiveVLocs.find(Var); + SmallVector> LostMLocs; + for (DebugVariableID VarID : ActiveMLocIt->second) { + auto ActiveVLocIt = ActiveVLocs.find(VarID); // Re-state the variable location: if there's no replacement then NewLoc // is std::nullopt and a $noreg DBG_VALUE will be created. 
Otherwise, a // DBG_VALUE identifying the alternative location will be emitted. @@ -855,19 +864,21 @@ class TransferTracker { replace_copy(ActiveVLocIt->second.Ops, DbgOps.begin(), OldOp, NewOp); } - PendingDbgValues.push_back(MTracker->emitLoc(DbgOps, Var, Properties)); + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); + PendingDbgValues.push_back(std::make_pair( + VarID, &*MTracker->emitLoc(DbgOps, Var, DILoc, Properties))); // Update machine locations <=> variable locations maps. Defer updating // ActiveMLocs to avoid invalidating the ActiveMLocIt iterator. if (!NewLoc) { for (LocIdx Loc : ActiveVLocIt->second.loc_indices()) { if (Loc != MLoc) - LostMLocs.emplace_back(Loc, Var); + LostMLocs.emplace_back(Loc, VarID); } ActiveVLocs.erase(ActiveVLocIt); } else { ActiveVLocIt->second.Ops = DbgOps; - NewMLocs.insert(Var); + NewMLocs.insert(VarID); } } @@ -891,8 +902,8 @@ class TransferTracker { // Commit ActiveMLoc changes. ActiveMLocIt->second.clear(); if (!NewMLocs.empty()) - for (auto &Var : NewMLocs) - ActiveMLocs[*NewLoc].insert(Var); + for (DebugVariableID VarID : NewMLocs) + ActiveMLocs[*NewLoc].insert(VarID); } /// Transfer variables based on \p Src to be based on \p Dst. This handles @@ -915,17 +926,18 @@ class TransferTracker { // For each variable based on Src; create a location at Dst. ResolvedDbgOp SrcOp(Src); ResolvedDbgOp DstOp(Dst); - for (const auto &Var : MovingVars) { - auto ActiveVLocIt = ActiveVLocs.find(Var); + for (DebugVariableID VarID : MovingVars) { + auto ActiveVLocIt = ActiveVLocs.find(VarID); assert(ActiveVLocIt != ActiveVLocs.end()); // Update all instances of Src in the variable's tracked values to Dst. 
std::replace(ActiveVLocIt->second.Ops.begin(), ActiveVLocIt->second.Ops.end(), SrcOp, DstOp); - MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var, + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); + MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var, DILoc, ActiveVLocIt->second.Properties); - PendingDbgValues.push_back(MI); + PendingDbgValues.push_back(std::make_pair(VarID, MI)); } ActiveMLocs[Src].clear(); flushDbgValues(Pos, nullptr); @@ -1176,11 +1188,9 @@ LLVM_DUMP_METHOD void MLocTracker::dump_mloc_map() { MachineInstrBuilder MLocTracker::emitLoc(const SmallVectorImpl &DbgOps, - const DebugVariable &Var, + const DebugVariable &Var, const DILocation *DILoc, const DbgValueProperties &Properties) { - DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0, - Var.getVariable()->getScope(), - const_cast(Var.getInlinedAt())); + DebugLoc DL = DebugLoc(DILoc); const MCInstrDesc &Desc = Properties.IsVariadic ? TII.get(TargetOpcode::DBG_VALUE_LIST) @@ -1726,7 +1736,8 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, LastUseBeforeDef = std::max(LastUseBeforeDef, NewID.getInst()); } if (IsValidUseBeforeDef) { - TTracker->addUseBeforeDef(V, {MI.getDebugExpression(), false, true}, + DebugVariableID VID = DVMap.insertDVID(V, MI.getDebugLoc().get()); + TTracker->addUseBeforeDef(VID, {MI.getDebugExpression(), false, true}, DbgOps, LastUseBeforeDef); } } @@ -1735,9 +1746,11 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI, // This DBG_VALUE is potentially a $noreg / undefined location, if // FoundLoc is illegal. // (XXX -- could morph the DBG_INSTR_REF in the future). 
- MachineInstr *DbgMI = MTracker->emitLoc(NewLocs, V, Properties); + MachineInstr *DbgMI = + MTracker->emitLoc(NewLocs, V, MI.getDebugLoc().get(), Properties); + DebugVariableID ID = DVMap.getDVID(V); - TTracker->PendingDbgValues.push_back(DbgMI); + TTracker->PendingDbgValues.push_back(std::make_pair(ID, DbgMI)); TTracker->flushDbgValues(MI.getIterator(), nullptr); return true; } @@ -3112,7 +3125,8 @@ void InstrRefBasedLDV::getBlocksForScope( } void InstrRefBasedLDV::buildVLocValueMap( - const DILocation *DILoc, const SmallSet &VarsWeCareAbout, + const DILocation *DILoc, + const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, SmallVectorImpl &AllTheVLocs) { @@ -3188,7 +3202,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // between blocks. This keeps the locality of working on one lexical scope at // at time, but avoids re-processing variable values because some other // variable has been assigned. - for (const auto &Var : VarsWeCareAbout) { + for (DebugVariableID VarID : VarsWeCareAbout) { // Re-initialize live-ins and live-outs, to clear the remains of previous // variables live-ins / live-outs. for (unsigned int I = 0; I < NumBlocks; ++I) { @@ -3202,7 +3216,7 @@ void InstrRefBasedLDV::buildVLocValueMap( SmallPtrSet DefBlocks; for (const MachineBasicBlock *ExpMBB : BlocksToExplore) { auto &TransferFunc = AllTheVLocs[ExpMBB->getNumber()].Vars; - if (TransferFunc.contains(Var)) + if (TransferFunc.contains(VarID)) DefBlocks.insert(const_cast(ExpMBB)); } @@ -3212,7 +3226,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // only one value definition, things are very simple. if (DefBlocks.size() == 1) { placePHIsForSingleVarDefinition(MutBlocksToExplore, *DefBlocks.begin(), - AllTheVLocs, Var, Output); + AllTheVLocs, VarID, Output); continue; } @@ -3285,7 +3299,7 @@ void InstrRefBasedLDV::buildVLocValueMap( // Do transfer function. 
auto &VTracker = AllTheVLocs[MBB->getNumber()]; - auto TransferIt = VTracker.Vars.find(Var); + auto TransferIt = VTracker.Vars.find(VarID); if (TransferIt != VTracker.Vars.end()) { // Erase on empty transfer (DBG_VALUE $noreg). if (TransferIt->second.Kind == DbgValue::Undef) { @@ -3347,9 +3361,11 @@ void InstrRefBasedLDV::buildVLocValueMap( continue; if (BlockLiveIn->Kind == DbgValue::VPHI) BlockLiveIn->Kind = DbgValue::Def; + auto &[Var, DILoc] = DVMap.lookupDVID(VarID); assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() == - Var.getFragment() && "Fragment info missing during value prop"); - Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn)); + Var.getFragment() && + "Fragment info missing during value prop"); + Output[MBB->getNumber()].push_back(std::make_pair(VarID, *BlockLiveIn)); } } // Per-variable loop. @@ -3360,7 +3376,7 @@ void InstrRefBasedLDV::buildVLocValueMap( void InstrRefBasedLDV::placePHIsForSingleVarDefinition( const SmallPtrSetImpl &InScopeBlocks, MachineBasicBlock *AssignMBB, SmallVectorImpl &AllTheVLocs, - const DebugVariable &Var, LiveInsT &Output) { + DebugVariableID VarID, LiveInsT &Output) { // If there is a single definition of the variable, then working out it's // value everywhere is very simple: it's every block dominated by the // definition. At the dominance frontier, the usual algorithm would: @@ -3373,7 +3389,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition( // Pick out the variables value from the block transfer function. 
VLocTracker &VLocs = AllTheVLocs[AssignMBB->getNumber()]; - auto ValueIt = VLocs.Vars.find(Var); + auto ValueIt = VLocs.Vars.find(VarID); const DbgValue &Value = ValueIt->second; // If it's an explicit assignment of "undef", that means there is no location @@ -3388,7 +3404,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition( if (!DomTree->properlyDominates(AssignMBB, ScopeBlock)) continue; - Output[ScopeBlock->getNumber()].push_back({Var, Value}); + Output[ScopeBlock->getNumber()].push_back({VarID, Value}); } // All blocks that aren't dominated have no live-in value, thus no variable @@ -3515,9 +3531,9 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToAssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, SmallVectorImpl &AllTheVLocs, MachineFunction &MF, - DenseMap &AllVarsNumbering, const TargetPassConfig &TPC) { - TTracker = new TransferTracker(TII, MTracker, MF, *TRI, CalleeSavedRegs, TPC); + TTracker = + new TransferTracker(TII, MTracker, MF, DVMap, *TRI, CalleeSavedRegs, TPC); unsigned NumLocs = MTracker->getNumLocs(); VTracker = nullptr; @@ -3622,31 +3638,24 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( if (MInLocs.hasTableFor(*MBB)) EjectBlock(*MBB); - return emitTransfers(AllVarsNumbering); + return emitTransfers(); } -bool InstrRefBasedLDV::emitTransfers( - DenseMap &AllVarsNumbering) { +bool InstrRefBasedLDV::emitTransfers() { // Go through all the transfers recorded in the TransferTracker -- this is // both the live-ins to a block, and any movements of values that happen // in the middle. - for (const auto &P : TTracker->Transfers) { + for (auto &P : TTracker->Transfers) { // We have to insert DBG_VALUEs in a consistent order, otherwise they // appear in DWARF in different orders. Use the order that they appear // when walking through each block / each instruction, stored in - // AllVarsNumbering. 
- SmallVector> Insts; - for (MachineInstr *MI : P.Insts) { - DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(), - MI->getDebugLoc()->getInlinedAt()); - Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI); - } - llvm::sort(Insts, llvm::less_first()); + // DVMap. + llvm::sort(P.Insts, llvm::less_first()); // Insert either before or after the designated point... if (P.MBB) { MachineBasicBlock &MBB = *P.MBB; - for (const auto &Pair : Insts) + for (const auto &Pair : P.Insts) MBB.insert(P.Pos, Pair.second); } else { // Terminators, like tail calls, can clobber things. Don't try and place @@ -3655,7 +3664,7 @@ bool InstrRefBasedLDV::emitTransfers( continue; MachineBasicBlock &MBB = *P.Pos->getParent(); - for (const auto &Pair : Insts) + for (const auto &Pair : P.Insts) MBB.insertAfterBundle(P.Pos, Pair.second); } } @@ -3710,7 +3719,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, initialSetup(MF); MLocTransfer.resize(MaxNumBlocks); - vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr)); + vlocs.resize(MaxNumBlocks, VLocTracker(DVMap, OverlapFragments, EmptyExpr)); SavedLiveIns.resize(MaxNumBlocks); produceMLocTransferFunction(MF, MLocTransfer, MaxNumBlocks); @@ -3766,10 +3775,6 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, MTracker->reset(); } - // Number all variables in the order that they appear, to be used as a stable - // insertion order later. - DenseMap AllVarsNumbering; - // Map from one LexicalScope to all the variables in that scope. ScopeToVarsT ScopeToVars; @@ -3788,16 +3793,15 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, auto *VTracker = &vlocs[MBB->getNumber()]; // Collect each variable with a DBG_VALUE in this block. 
for (auto &idx : VTracker->Vars) { - const auto &Var = idx.first; - const DILocation *ScopeLoc = VTracker->Scopes[Var]; + DebugVariableID VarID = idx.first; + const DILocation *ScopeLoc = VTracker->Scopes[VarID]; assert(ScopeLoc != nullptr); auto *Scope = LS.findLexicalScope(ScopeLoc); // No insts in scope -> shouldn't have been recorded. assert(Scope != nullptr); - AllVarsNumbering.insert(std::make_pair(Var, AllVarsNumbering.size())); - ScopeToVars[Scope].insert(Var); + ScopeToVars[Scope].insert(VarID); ScopeToAssignBlocks[Scope].insert(VTracker->MBB); ScopeToDILocation[Scope] = ScopeLoc; ++VarAssignCount; @@ -3821,7 +3825,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // the "else" block of this condition. Changed = depthFirstVLocAndEmit( MaxNumBlocks, ScopeToDILocation, ScopeToVars, ScopeToAssignBlocks, - SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, AllVarsNumbering, *TPC); + SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, *TPC); } delete MTracker; @@ -3840,6 +3844,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, SeenFragments.clear(); SeenDbgPHIs.clear(); DbgOpStore.clear(); + DVMap.clear(); return Changed; } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index 8770983481c2f..d9851ad13eab2 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -35,6 +35,44 @@ class DbgOpIDMap; using namespace llvm; +using DebugVariableID = unsigned; +using VarAndLoc = std::pair; + +/// Mapping from DebugVariable to/from a unique identifying number. Each +/// DebugVariable consists of three pointers, and after a small amount of +/// work to identify overlapping fragments of variables we mostly only use +/// DebugVariables as identities of variables. It's much more compile-time +/// efficient to use an ID number instead, which this class provides. 
+class DebugVariableMap { + DenseMap VarToIdx; + SmallVector IdxToVar; + +public: + DebugVariableID getDVID(const DebugVariable &Var) const { + auto It = VarToIdx.find(Var); + assert(It != VarToIdx.end()); + return It->second; + } + + DebugVariableID insertDVID(DebugVariable &Var, const DILocation *Loc) { + unsigned Size = VarToIdx.size(); + auto ItPair = VarToIdx.insert({Var, Size}); + if (ItPair.second) { + IdxToVar.push_back({Var, Loc}); + return Size; + } + + return ItPair.first->second; + } + + const VarAndLoc &lookupDVID(DebugVariableID ID) const { return IdxToVar[ID]; } + + void clear() { + VarToIdx.clear(); + IdxToVar.clear(); + } +}; + /// Handle-class for a particular "location". This value-type uniquely /// symbolises a register or stack location, allowing manipulation of locations /// without concern for where that location is. Practically, this allows us to @@ -985,7 +1023,7 @@ class MLocTracker { /// information in \pProperties, for variable Var. Don't insert it anywhere, /// just return the builder for it. MachineInstrBuilder emitLoc(const SmallVectorImpl &DbgOps, - const DebugVariable &Var, + const DebugVariable &Var, const DILocation *DILoc, const DbgValueProperties &Properties); }; @@ -1003,38 +1041,45 @@ using OverlapMap = /// identified. class VLocTracker { public: + /// Ref to function-wide map of DebugVariable <=> ID-numbers. + DebugVariableMap &DVMap; /// Map DebugVariable to the latest Value it's defined to have. /// Needs to be a MapVector because we determine order-in-the-input-MIR from - /// the order in this container. + /// the order in this container. (FIXME: likely no longer true as the ordering + /// is now provided by DebugVariableMap). /// We only retain the last DbgValue in each block for each variable, to /// determine the blocks live-out variable value. The Vars container forms the /// transfer function for this block, as part of the dataflow analysis. 
The /// movement of values between locations inside of a block is handled at a /// much later stage, in the TransferTracker class. - MapVector Vars; - SmallDenseMap Scopes; + MapVector Vars; + SmallDenseMap Scopes; MachineBasicBlock *MBB = nullptr; const OverlapMap &OverlappingFragments; DbgValueProperties EmptyProperties; public: - VLocTracker(const OverlapMap &O, const DIExpression *EmptyExpr) - : OverlappingFragments(O), EmptyProperties(EmptyExpr, false, false) {} + VLocTracker(DebugVariableMap &DVMap, const OverlapMap &O, + const DIExpression *EmptyExpr) + : DVMap(DVMap), OverlappingFragments(O), + EmptyProperties(EmptyExpr, false, false) {} void defVar(const MachineInstr &MI, const DbgValueProperties &Properties, const SmallVectorImpl &DebugOps) { assert(MI.isDebugValueLike()); DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), MI.getDebugLoc()->getInlinedAt()); + // Either insert or fetch an ID number for this variable. + DebugVariableID VarID = DVMap.insertDVID(Var, MI.getDebugLoc().get()); DbgValue Rec = (DebugOps.size() > 0) ? DbgValue(DebugOps, Properties) : DbgValue(Properties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. - auto Result = Vars.insert(std::make_pair(Var, Rec)); + auto Result = Vars.insert(std::make_pair(VarID, Rec)); if (!Result.second) Result.first->second = Rec; - Scopes[Var] = MI.getDebugLoc().get(); + Scopes[VarID] = MI.getDebugLoc().get(); considerOverlaps(Var, MI.getDebugLoc().get()); } @@ -1056,13 +1101,15 @@ class VLocTracker { DebugVariable Overlapped(Var.getVariable(), OptFragmentInfo, Var.getInlinedAt()); + // Produce an ID number for this overlapping fragment of a variable. + DebugVariableID OverlappedID = DVMap.insertDVID(Overlapped, Loc); DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef); // Attempt insertion; overwrite if it's already mapped. 
- auto Result = Vars.insert(std::make_pair(Overlapped, Rec)); + auto Result = Vars.insert(std::make_pair(OverlappedID, Rec)); if (!Result.second) Result.first->second = Rec; - Scopes[Overlapped] = Loc; + Scopes[OverlappedID] = Loc; } } @@ -1093,7 +1140,7 @@ class InstrRefBasedLDV : public LDVImpl { /// variables to their values. using LiveIdxT = DenseMap; - using VarAndLoc = std::pair; + using VarAndLoc = std::pair; /// Type for a live-in value: the predecessor block, and its value. using InValueT = std::pair; @@ -1106,7 +1153,8 @@ class InstrRefBasedLDV : public LDVImpl { using ScopeToDILocT = DenseMap; /// Mapping from lexical scopes to variables in that scope. - using ScopeToVarsT = DenseMap>; + using ScopeToVarsT = + DenseMap>; /// Mapping from lexical scopes to blocks where variables in that scope are /// assigned. Such blocks aren't necessarily "in" the lexical scope, it's @@ -1200,6 +1248,11 @@ class InstrRefBasedLDV : public LDVImpl { DbgOpIDMap DbgOpStore; + /// Mapping between DebugVariables and unique ID numbers. This is a more + /// efficient way to represent the identity of a variable, versus a plain + /// DebugVariable. + DebugVariableMap DVMap; + /// True if we need to examine call instructions for stack clobbers. We /// normally assume that they don't clobber SP, but stack probes on Windows /// do. @@ -1330,9 +1383,9 @@ class InstrRefBasedLDV : public LDVImpl { /// performance as it doesn't have to find the dominance frontier between /// different assignments. void placePHIsForSingleVarDefinition( - const SmallPtrSetImpl &InScopeBlocks, - MachineBasicBlock *MBB, SmallVectorImpl &AllTheVLocs, - const DebugVariable &Var, LiveInsT &Output); + const SmallPtrSetImpl &InScopeBlocks, + MachineBasicBlock *MBB, SmallVectorImpl &AllTheVLocs, + DebugVariableID Var, LiveInsT &Output); /// Calculate the iterated-dominance-frontier for a set of defs, using the /// existing LLVM facilities for this. 
Works for a single "value" or @@ -1381,7 +1434,7 @@ class InstrRefBasedLDV : public LDVImpl { /// scope, but which do contain DBG_VALUEs, which VarLocBasedImpl tracks /// locations through. void buildVLocValueMap(const DILocation *DILoc, - const SmallSet &VarsWeCareAbout, + const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, @@ -1414,10 +1467,8 @@ class InstrRefBasedLDV : public LDVImpl { const SmallVectorImpl &BlockOrders); /// Take collections of DBG_VALUE instructions stored in TTracker, and - /// install them into their output blocks. Preserves a stable order of - /// DBG_VALUEs produced (which would otherwise cause nondeterminism) through - /// the AllVarsNumbering order. - bool emitTransfers(DenseMap &AllVarsNumbering); + /// install them into their output blocks. + bool emitTransfers(); /// Boilerplate computation of some initial sets, artifical blocks and /// RPOT block ordering. @@ -1437,13 +1488,14 @@ class InstrRefBasedLDV : public LDVImpl { /// block information can be fully computed before exploration finishes, /// allowing us to emit it and free data structures earlier than otherwise. /// It's also good for locality. 
- bool depthFirstVLocAndEmit( - unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation, - const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToBlocks, - LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, - SmallVectorImpl &AllTheVLocs, MachineFunction &MF, - DenseMap &AllVarsNumbering, - const TargetPassConfig &TPC); + bool depthFirstVLocAndEmit(unsigned MaxNumBlocks, + const ScopeToDILocT &ScopeToDILocation, + const ScopeToVarsT &ScopeToVars, + ScopeToAssignBlocksT &ScopeToBlocks, + LiveInsT &Output, FuncValueTable &MOutLocs, + FuncValueTable &MInLocs, + SmallVectorImpl &AllTheVLocs, + MachineFunction &MF, const TargetPassConfig &TPC); bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree, TargetPassConfig *TPC, unsigned InputBBLimit, @@ -1473,6 +1525,11 @@ class InstrRefBasedLDV : public LDVImpl { } std::optional findLocationForMemOperand(const MachineInstr &MI); + + // Utility for unit testing, don't use directly. + DebugVariableMap &getDVMap() { + return DVMap; + } }; } // namespace LiveDebugValues diff --git a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir index b54c748ac9e84..67bfd85dcb379 100644 --- a/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir +++ b/llvm/test/DebugInfo/MIR/X86/live-debug-values-fragments.mir @@ -17,12 +17,12 @@ # CHECK-LABEL: bb.3.bb3: # CHECK: DBG_VALUE $ecx, $noreg, !{{[0-9]+}}, # CHECK-SAME: !DIExpression(DW_OP_LLVM_fragment, 0, 32) -# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, +# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, # CHECK-SAME: !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 0, 32) # CHECK-SAME: $ecx, $r8d # CHECK-NEXT: DBG_VALUE $ebx, $noreg, !{{[0-9]+}}, # CHECK-SAME: !DIExpression(DW_OP_LLVM_fragment, 32, 32) -# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, +# CHECK-NEXT: DBG_VALUE_LIST !{{[0-9]+}}, # CHECK-SAME: !DIExpression({{[^)]+}}, DW_OP_LLVM_fragment, 32, 32) # CHECK-SAME: $ebx, $r10d # 
CHECK-NEXT: XOR32rr diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 306a97c3149cc..28cfb3046bd47 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -55,6 +55,7 @@ class InstrRefLDVTest : public testing::Test { DIBasicType *LongInt; DIExpression *EmptyExpr; LiveDebugValues::OverlapMap Overlaps; + LiveDebugValues::DebugVariableMap DVMap; DebugLoc OutermostLoc, InBlockLoc, NotNestedBlockLoc, InlinedLoc; @@ -176,7 +177,7 @@ class InstrRefLDVTest : public testing::Test { void addVTracker() { ASSERT_TRUE(LDV); - VTracker = std::make_unique(Overlaps, EmptyExpr); + VTracker = std::make_unique(DVMap, Overlaps, EmptyExpr); LDV->VTracker = &*VTracker; } @@ -215,7 +216,7 @@ class InstrRefLDVTest : public testing::Test { } void buildVLocValueMap(const DILocation *DILoc, - const SmallSet &VarsWeCareAbout, + const SmallSet &VarsWeCareAbout, SmallPtrSetImpl &AssignBlocks, InstrRefBasedLDV::LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs, @@ -2632,10 +2633,11 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) { MInLocs[0][0] = MOutLocs[0][0] = LiveInRsp; DebugVariable Var(FuncVariable, std::nullopt, nullptr); + DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc); DbgValueProperties EmptyProps(EmptyExpr, false, false); - SmallSet AllVars; - AllVars.insert(Var); + SmallSet AllVars; + AllVars.insert(VarID); // Mild hack: rather than constructing machine instructions in each block // and creating lexical scopes across them, instead just tell @@ -2645,7 +2647,7 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) { AssignBlocks.insert(MBB0); SmallVector VLocs; - VLocs.resize(1, VLocTracker(Overlaps, EmptyExpr)); + VLocs.resize(1, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr)); InstrRefBasedLDV::LiveInsT Output; @@ -2657,7 +2659,7 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) { // If we put an assignment in the transfer function, that should... 
well, // do nothing, because we don't store the live-outs. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output.size(), 0ul); @@ -2694,10 +2696,11 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { initValueArray(MOutLocs, 4, 2); DebugVariable Var(FuncVariable, std::nullopt, nullptr); + DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc); DbgValueProperties EmptyProps(EmptyExpr, false, false); - SmallSet AllVars; - AllVars.insert(Var); + SmallSet AllVars; + AllVars.insert(VarID); // Mild hack: rather than constructing machine instructions in each block // and creating lexical scopes across them, instead just tell @@ -2710,7 +2713,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { AssignBlocks.insert(MBB3); SmallVector VLocs; - VLocs.resize(4, VLocTracker(Overlaps, EmptyExpr)); + VLocs.resize(4, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr)); InstrRefBasedLDV::LiveInsT Output; @@ -2736,7 +2739,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // An assignment in the end block should also not affect other blocks; or // produce any live-ins. - VLocs[3].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[3].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2748,7 +2751,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // Assignments in either of the side-of-diamond blocks should also not be // propagated anywhere. 
VLocs[3].Vars.clear(); - VLocs[2].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[2].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2758,7 +2761,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { VLocs[2].Vars.clear(); ClearOutputs(); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2770,7 +2773,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // However: putting an assignment in the first block should propagate variable // values through to all other blocks, as it dominates. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2790,7 +2793,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // should still be propagated, as buildVLocValueMap shouldn't care about // what's in the registers (except for PHIs). // values through to all other blocks, as it dominates. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2808,8 +2811,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // We should get a live-in to the merging block, if there are two assigns of // the same value in either side of the diamond. 
- VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[2].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[2].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2824,8 +2827,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // If we assign a value in the entry block, then 'undef' on a branch, we // shouldn't have a live-in in the merge block. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(EmptyProps, DbgValue::Undef)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(EmptyProps, DbgValue::Undef)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2843,8 +2846,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // Having different values joining into the merge block should mean we have // no live-in in that block. Block ones LiveInRax value doesn't appear as a // live-in anywhere, it's block internal. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2862,8 +2865,8 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { // But on the other hand, if there's a location in the register file where // those two values can be joined, do so. 
MOutLocs[1][0] = LiveInRax; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2915,14 +2918,15 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { initValueArray(MOutLocs, 3, 2); DebugVariable Var(FuncVariable, std::nullopt, nullptr); + DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc); DbgValueProperties EmptyProps(EmptyExpr, false, false); DIExpression *TwoOpExpr = DIExpression::get(Ctx, {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_LLVM_arg, 1, dwarf::DW_OP_plus}); DbgValueProperties VariadicProps(TwoOpExpr, false, true); - SmallSet AllVars; - AllVars.insert(Var); + SmallSet AllVars; + AllVars.insert(VarID); SmallPtrSet AssignBlocks; AssignBlocks.insert(MBB0); @@ -2930,7 +2934,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { AssignBlocks.insert(MBB2); SmallVector VLocs; - VLocs.resize(3, VLocTracker(Overlaps, EmptyExpr)); + VLocs.resize(3, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr)); InstrRefBasedLDV::LiveInsT Output; @@ -2947,7 +2951,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { Output.resize(3); // Easy starter: a dominating assign should propagate to all blocks. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2963,7 +2967,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { // A variadic assignment should behave the same. 
DbgOpID Locs0[] = {LiveInRspID, LiveInRaxID}; - VLocs[0].Vars.insert({Var, DbgValue(Locs0, VariadicProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(Locs0, VariadicProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2979,8 +2983,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { VLocs[1].Vars.clear(); // Put an undef assignment in the loop. Should get no live-in value. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(EmptyProps, DbgValue::Undef)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(EmptyProps, DbgValue::Undef)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -2991,8 +2995,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { VLocs[1].Vars.clear(); // Assignment of the same value should naturally join. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3008,8 +3012,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { // Assignment of different values shouldn't join with no machine PHI vals. // Will be live-in to exit block as it's dominated. 
- VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3025,8 +3029,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { // with unrelated assign in loop block again. MInLocs[1][0] = RspPHIInBlk1; MOutLocs[1][0] = RspDefInBlk1; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3042,8 +3046,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { // find the appropriate PHI. 
MInLocs[1][0] = RspPHIInBlk1; MOutLocs[1][0] = RspDefInBlk1; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(RspDefInBlk1ID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(RspDefInBlk1ID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3063,8 +3067,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { MOutLocs[1][0] = LiveInRsp; MInLocs[1][1] = RaxPHIInBlk1; MOutLocs[1][1] = RspDefInBlk1; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(RspDefInBlk1ID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(RspDefInBlk1ID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3085,8 +3089,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { MOutLocs[1][0] = RspDefInBlk1; MInLocs[1][1] = RaxPHIInBlk1; MOutLocs[1][1] = RspDefInBlk1; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(RspDefInBlk1ID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(RspDefInBlk1ID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3119,8 +3123,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { MOutLocs[1][0] = RspPHIInBlk1; MInLocs[1][1] = LiveInRax; MOutLocs[1][1] = LiveInRax; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(RspPHIInBlk1ID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(RspPHIInBlk1ID, EmptyProps)}); 
buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3138,8 +3142,8 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { // because there's a def in it. MInLocs[1][0] = LiveInRsp; MOutLocs[1][0] = LiveInRsp; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3193,10 +3197,11 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { initValueArray(MOutLocs, 5, 2); DebugVariable Var(FuncVariable, std::nullopt, nullptr); + DebugVariableID VarID = LDV->getDVMap().insertDVID(Var, OutermostLoc); DbgValueProperties EmptyProps(EmptyExpr, false, false); - SmallSet AllVars; - AllVars.insert(Var); + SmallSet AllVars; + AllVars.insert(VarID); SmallPtrSet AssignBlocks; AssignBlocks.insert(MBB0); @@ -3206,7 +3211,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { AssignBlocks.insert(MBB4); SmallVector VLocs; - VLocs.resize(5, VLocTracker(Overlaps, EmptyExpr)); + VLocs.resize(5, VLocTracker(LDV->getDVMap(), Overlaps, EmptyExpr)); InstrRefBasedLDV::LiveInsT Output; @@ -3223,7 +3228,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { Output.resize(5); // A dominating assign should propagate to all blocks. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3244,8 +3249,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { // Test that an assign in the inner loop causes unresolved PHIs at the heads // of both loops, and no output location. Dominated blocks do get values. 
- VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[2].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[2].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3263,7 +3268,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { // Same test, but with no assignment in block 0. We should still get values // in dominated blocks. - VLocs[2].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[2].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3280,8 +3285,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { // Similarly, assignments in the outer loop gives location to dominated // blocks, but no PHI locations are found at the outer loop head. 
- VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[3].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[3].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3295,8 +3300,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { VLocs[0].Vars.clear(); VLocs[3].Vars.clear(); - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[1].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[1].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3317,8 +3322,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { // With an assignment of the same value in the inner loop, we should work out // that all PHIs can be eliminated and the same value is live-through the // whole function. - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[2].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[2].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3348,8 +3353,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { // one. Even though RspPHIInBlk2 isn't available later in the function, we // should still produce a live-in value. The fact it's unavailable is a // different concern. 
- VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[2].Vars.insert({Var, DbgValue(LiveInRaxID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[2].Vars.insert({VarID, DbgValue(LiveInRaxID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); @@ -3374,8 +3379,8 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { MOutLocs[2][0] = RspDefInBlk2; MInLocs[3][0] = RspDefInBlk2; MOutLocs[3][0] = RspDefInBlk2; - VLocs[0].Vars.insert({Var, DbgValue(LiveInRspID, EmptyProps)}); - VLocs[2].Vars.insert({Var, DbgValue(RspDefInBlk2ID, EmptyProps)}); + VLocs[0].Vars.insert({VarID, DbgValue(LiveInRspID, EmptyProps)}); + VLocs[2].Vars.insert({VarID, DbgValue(RspDefInBlk2ID, EmptyProps)}); buildVLocValueMap(OutermostLoc, AllVars, AssignBlocks, Output, MOutLocs, MInLocs, VLocs); EXPECT_EQ(Output[0].size(), 0ul); From 497ea1d84951626dea5bf644fef2d99e145e21ac Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Jul 2024 16:06:43 +0100 Subject: [PATCH 034/486] [DAG] tryToFoldExtendSelectLoad - reuse existing SDLoc. NFC. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 30203f9119af7..9b2153c68ccae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12812,13 +12812,11 @@ static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) { /// This function is called by the DAGCombiner when visiting sext/zext/aext /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). 
static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, - SelectionDAG &DAG, + SelectionDAG &DAG, const SDLoc &DL, CombineLevel Level) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - SDLoc DL(N); - assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND) && "Expected EXTEND dag node in input!"); @@ -13775,7 +13773,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT)); } - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level)) return Res; return SDValue(); @@ -13860,8 +13858,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { // fold (zext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x) if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) - return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, - N0.getOperand(0)); + return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, N0.getOperand(0)); // fold (zext (truncate x)) -> (zext x) or // (zext (truncate x)) -> (truncate x) @@ -14147,7 +14144,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (SDValue V = widenAbs(N, DAG)) return V; - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level)) return Res; // CSE zext nneg with sext if the zext is not free. 
@@ -14322,7 +14319,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (SDValue NewCtPop = widenCtPop(N, DAG)) return NewCtPop; - if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) + if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, DL, Level)) return Res; return SDValue(); From 2bdcfbe62cb9a08df4b58a17d44be0a3082df053 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Thu, 18 Jul 2024 19:29:51 +0400 Subject: [PATCH 035/486] [clang] Fix crash in concept deprecation (#98622) There is a gap between `getAs()` and `getConstrainedAutoType()` that the original patch #92295 was not aware of. Fixes #98164 --- clang/lib/Sema/SemaDecl.cpp | 8 ++++---- clang/lib/Sema/SemaType.cpp | 9 ++++----- clang/test/CXX/drs/cwg24xx.cpp | 9 +++++++++ clang/test/SemaCXX/cxx-deprecated.cpp | 9 +++++++++ 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 6c3589bf87433..bb25a0b3a45ae 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7436,10 +7436,10 @@ NamedDecl *Sema::ActOnVariableDeclarator( tryToFixVariablyModifiedVarType(TInfo, R, D.getIdentifierLoc(), /*DiagID=*/0); - if (const AutoType *AutoT = R->getAs()) - CheckConstrainedAuto( - AutoT, - TInfo->getTypeLoc().getContainedAutoTypeLoc().getConceptNameLoc()); + if (AutoTypeLoc TL = TInfo->getTypeLoc().getContainedAutoTypeLoc()) { + const AutoType *AT = TL.getTypePtr(); + CheckConstrainedAuto(AT, TL.getConceptNameLoc()); + } bool IsMemberSpecialization = false; bool IsVariableTemplateSpecialization = false; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index baac1fe4f2407..6fa39cdccef2b 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -6363,11 +6363,10 @@ TypeResult Sema::ActOnTypeName(Declarator &D) { CheckExtraCXXDefaultArguments(D); } - if (const AutoType *AutoT = T->getAs()) - CheckConstrainedAuto( - AutoT, - 
TInfo->getTypeLoc().getContainedAutoTypeLoc().getConceptNameLoc()); - + if (AutoTypeLoc TL = TInfo->getTypeLoc().getContainedAutoTypeLoc()) { + const AutoType *AT = TL.getTypePtr(); + CheckConstrainedAuto(AT, TL.getConceptNameLoc()); + } return CreateParsedType(T, TInfo); } diff --git a/clang/test/CXX/drs/cwg24xx.cpp b/clang/test/CXX/drs/cwg24xx.cpp index 16b8ec07fc50f..00b6bb5a865df 100644 --- a/clang/test/CXX/drs/cwg24xx.cpp +++ b/clang/test/CXX/drs/cwg24xx.cpp @@ -82,6 +82,15 @@ auto h() -> C auto { C auto foo = T(); // expected-warning@-1 {{'C' is deprecated}} // expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}} + C auto *bar = T(); + // expected-warning@-1 {{'C' is deprecated}} + // expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}} + C auto &baz = T(); + // expected-warning@-1 {{'C' is deprecated}} + // expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}} + C auto &&quux = T(); + // expected-warning@-1 {{'C' is deprecated}} + // expected-note@#cwg2428-C {{'C' has been explicitly marked deprecated here}} return foo; } #endif diff --git a/clang/test/SemaCXX/cxx-deprecated.cpp b/clang/test/SemaCXX/cxx-deprecated.cpp index 81eb07608300d..d7de609d58cdd 100644 --- a/clang/test/SemaCXX/cxx-deprecated.cpp +++ b/clang/test/SemaCXX/cxx-deprecated.cpp @@ -36,4 +36,13 @@ template // expected-warning@-1 {{'C' is deprecated}} // expected-note@#C {{'C' has been explicitly marked deprecated here}} void f(); + +namespace GH98164 { +template +auto b() = delete; // #b + +decltype(b<0>()) x; +// expected-error@-1 {{call to deleted function 'b'}} +// expected-note@#b {{candidate function [with $0 = 0] has been explicitly deleted}} +} // namespace GH98164 } // namespace cxx20_concept From a2d309912a2863dfe7286ffde67b968e8c720b07 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 18 Jul 2024 16:34:16 +0100 Subject: [PATCH 036/486] [FMV][AArch64] Do not emit ifunc resolver on use. 
(#97761) It was raised in https://github.com/llvm/llvm-project/issues/81494 that we are not generating correct code when there is no TU-local caller. The suggestion was to emit a resolver: * Whenever there is a use in the TU. * When the TU has a definition of the default version. See the comment for more details: https://github.com/llvm/llvm-project/issues/81494#issuecomment-1985963497 This got addressed with https://github.com/llvm/llvm-project/pull/84405. Generating a resolver on use means that we may end up with multiple resolvers across different translation units. Those resolvers may not be the same because each translation unit may contain different version declarations (user's fault). Therefore the order of linking the final image determines which of these weak symbols gets selected, resulting in non consisted behavior. I am proposing to stop emitting a resolver on use and only do so in the translation unit which contains the default definition. This way we guarantee the existence of a single resolver. Now, when a versioned function is used we want to emit a declaration of the function symbol omitting the multiversion mangling. 
I have added a requirement to ACLE mandating that all the function versions are declared in the translation unit which contains the default definition: https://github.com/ARM-software/acle/pull/328 --- clang/lib/CodeGen/CodeGenModule.cpp | 97 ++- .../CodeGen/aarch64-fmv-resolver-emission.c | 111 +++ .../CodeGen/aarch64-mixed-target-attributes.c | 6 +- .../test/CodeGen/attr-target-clones-aarch64.c | 348 ++++---- clang/test/CodeGen/attr-target-version.c | 818 +++++++++--------- .../aarch64-fmv-resolver-emission.cpp | 111 +++ .../CodeGenCXX/attr-target-clones-aarch64.cpp | 210 ++--- clang/test/CodeGenCXX/attr-target-version.cpp | 247 +++--- clang/test/CodeGenCXX/fmv-namespace.cpp | 49 +- 9 files changed, 1117 insertions(+), 880 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-fmv-resolver-emission.c create mode 100644 clang/test/CodeGenCXX/aarch64-fmv-resolver-emission.cpp diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 0c002b553e4c6..71192cb0e8c4a 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3796,8 +3796,7 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { // Forward declarations are emitted lazily on first use. 
if (!FD->doesThisDeclarationHaveABody()) { if (!FD->doesDeclarationForceExternallyVisibleDefinition() && - (!FD->isMultiVersion() || - !FD->getASTContext().getTargetInfo().getTriple().isAArch64())) + (!FD->isMultiVersion() || !getTarget().getTriple().isAArch64())) return; StringRef MangledName = getMangledName(GD); @@ -4191,23 +4190,6 @@ llvm::GlobalValue::LinkageTypes getMultiversionLinkage(CodeGenModule &CGM, return llvm::GlobalValue::WeakODRLinkage; } -static FunctionDecl *createDefaultTargetVersionFrom(const FunctionDecl *FD) { - auto *DeclCtx = const_cast(FD->getDeclContext()); - TypeSourceInfo *TInfo = FD->getTypeSourceInfo(); - StorageClass SC = FD->getStorageClass(); - DeclarationName Name = FD->getNameInfo().getName(); - - FunctionDecl *NewDecl = - FunctionDecl::Create(FD->getASTContext(), DeclCtx, FD->getBeginLoc(), - FD->getEndLoc(), Name, TInfo->getType(), TInfo, SC); - - NewDecl->setIsMultiVersion(); - NewDecl->addAttr(TargetVersionAttr::CreateImplicit( - NewDecl->getASTContext(), "default", NewDecl->getSourceRange())); - - return NewDecl; -} - void CodeGenModule::emitMultiVersionFunctions() { std::vector MVFuncsToEmit; MultiVersionFuncs.swap(MVFuncsToEmit); @@ -4234,29 +4216,30 @@ void CodeGenModule::emitMultiVersionFunctions() { return cast(Func); }; - bool HasDefaultDecl = !FD->isTargetVersionMultiVersion(); - bool ShouldEmitResolver = - !getContext().getTargetInfo().getTriple().isAArch64(); + // For AArch64, a resolver is only emitted if a function marked with + // target_version("default")) or target_clones() is present and defined + // in this TU. For other architectures it is always emitted. 
+ bool ShouldEmitResolver = !getTarget().getTriple().isAArch64(); SmallVector Options; getContext().forEachMultiversionedFunctionVersion( FD, [&](const FunctionDecl *CurFD) { llvm::SmallVector Feats; + bool IsDefined = CurFD->doesThisDeclarationHaveABody(); if (const auto *TA = CurFD->getAttr()) { TA->getAddedFeatures(Feats); llvm::Function *Func = createFunction(CurFD); Options.emplace_back(Func, TA->getArchitecture(), Feats); } else if (const auto *TVA = CurFD->getAttr()) { - bool HasDefaultDef = TVA->isDefaultVersion() && - CurFD->doesThisDeclarationHaveABody(); - HasDefaultDecl |= TVA->isDefaultVersion(); - ShouldEmitResolver |= (CurFD->isUsed() || HasDefaultDef); + if (TVA->isDefaultVersion() && IsDefined) + ShouldEmitResolver = true; TVA->getFeatures(Feats); llvm::Function *Func = createFunction(CurFD); Options.emplace_back(Func, /*Architecture*/ "", Feats); } else if (const auto *TC = CurFD->getAttr()) { - ShouldEmitResolver |= CurFD->doesThisDeclarationHaveABody(); + if (IsDefined) + ShouldEmitResolver = true; for (unsigned I = 0; I < TC->featuresStrs_size(); ++I) { if (!TC->isFirstOfVersion(I)) continue; @@ -4282,13 +4265,6 @@ void CodeGenModule::emitMultiVersionFunctions() { if (!ShouldEmitResolver) continue; - if (!HasDefaultDecl) { - FunctionDecl *NewFD = createDefaultTargetVersionFrom(FD); - llvm::Function *Func = createFunction(NewFD); - llvm::SmallVector Feats; - Options.emplace_back(Func, /*Architecture*/ "", Feats); - } - llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD); if (auto *IFunc = dyn_cast(ResolverConstant)) { ResolverConstant = IFunc->getResolver(); @@ -4339,6 +4315,14 @@ void CodeGenModule::emitMultiVersionFunctions() { emitMultiVersionFunctions(); } +static void replaceDeclarationWith(llvm::GlobalValue *Old, + llvm::Constant *New) { + assert(cast(Old)->isDeclaration() && "Not a declaration"); + New->takeName(Old); + Old->replaceAllUsesWith(New); + Old->eraseFromParent(); +} + void 
CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) { const auto *FD = cast(GD.getDecl()); assert(FD && "Not a FunctionDecl?"); @@ -4443,12 +4427,9 @@ void CodeGenModule::emitCPUDispatchDefinition(GlobalDecl GD) { // Fix up function declarations that were created for cpu_specific before // cpu_dispatch was known if (!isa(IFunc)) { - assert(cast(IFunc)->isDeclaration()); auto *GI = llvm::GlobalIFunc::create(DeclTy, 0, Linkage, "", ResolverFunc, &getModule()); - GI->takeName(IFunc); - IFunc->replaceAllUsesWith(GI); - IFunc->eraseFromParent(); + replaceDeclarationWith(IFunc, GI); IFunc = GI; } @@ -4478,7 +4459,8 @@ void CodeGenModule::AddDeferredMultiVersionResolverToEmit(GlobalDecl GD) { } /// If a dispatcher for the specified mangled name is not in the module, create -/// and return an llvm Function with the specified type. +/// and return it. The dispatcher is either an llvm Function with the specified +/// type, or a global ifunc. llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { const auto *FD = cast(GD.getDecl()); assert(FD && "Not a FunctionDecl?"); @@ -4506,8 +4488,15 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { ResolverName += ".resolver"; } - // If the resolver has already been created, just return it. - if (llvm::GlobalValue *ResolverGV = GetGlobalValue(ResolverName)) + // If the resolver has already been created, just return it. This lookup may + // yield a function declaration instead of a resolver on AArch64. That is + // because we didn't know whether a resolver will be generated when we first + // encountered a use of the symbol named after this resolver. Therefore, + // targets which support ifuncs should not return here unless we actually + // found an ifunc. 
+ llvm::GlobalValue *ResolverGV = GetGlobalValue(ResolverName); + if (ResolverGV && + (isa(ResolverGV) || !getTarget().supportsIFunc())) return ResolverGV; const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); @@ -4533,7 +4522,8 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { "", Resolver, &getModule()); GIF->setName(ResolverName); SetCommonAttributes(FD, GIF); - + if (ResolverGV) + replaceDeclarationWith(ResolverGV, GIF); return GIF; } @@ -4542,6 +4532,8 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) { assert(isa(Resolver) && "Resolver should be created for the first time"); SetCommonAttributes(FD, cast(Resolver)); + if (ResolverGV) + replaceDeclarationWith(ResolverGV, Resolver); return Resolver; } @@ -4571,6 +4563,7 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction( ForDefinition_t IsForDefinition) { const Decl *D = GD.getDecl(); + std::string NameWithoutMultiVersionMangling; // Any attempts to use a MultiVersion function should result in retrieving // the iFunc instead. Name Mangling will handle the rest of the changes. if (const FunctionDecl *FD = cast_or_null(D)) { @@ -4592,14 +4585,24 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction( if (FD->isMultiVersion()) { UpdateMultiVersionNames(GD, FD, MangledName); - if (FD->getASTContext().getTargetInfo().getTriple().isAArch64() && - !FD->isUsed()) - AddDeferredMultiVersionResolverToEmit(GD); - else if (!IsForDefinition) - return GetOrCreateMultiVersionResolver(GD); + if (!IsForDefinition) { + // On AArch64 we do not immediatelly emit an ifunc resolver when a + // function is used. Instead we defer the emission until we see a + // default definition. In the meantime we just reference the symbol + // without FMV mangling (it may or may not be replaced later). 
+ if (getTarget().getTriple().isAArch64()) { + AddDeferredMultiVersionResolverToEmit(GD); + NameWithoutMultiVersionMangling = getMangledNameImpl( + *this, GD, FD, /*OmitMultiVersionMangling=*/true); + } else + return GetOrCreateMultiVersionResolver(GD); + } } } + if (!NameWithoutMultiVersionMangling.empty()) + MangledName = NameWithoutMultiVersionMangling; + // Lookup the entry, lazily creating it if necessary. llvm::GlobalValue *Entry = GetGlobalValue(MangledName); if (Entry) { diff --git a/clang/test/CodeGen/aarch64-fmv-resolver-emission.c b/clang/test/CodeGen/aarch64-fmv-resolver-emission.c new file mode 100644 index 0000000000000..eeafb3d41860d --- /dev/null +++ b/clang/test/CodeGen/aarch64-fmv-resolver-emission.c @@ -0,0 +1,111 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s + +// CHECK: @used_before_default_def = weak_odr ifunc void (), ptr @used_before_default_def.resolver +// CHECK: @used_after_default_def = weak_odr ifunc void (), ptr @used_after_default_def.resolver +// CHECK-NOT: @used_before_default_decl = weak_odr ifunc void (), ptr @used_before_default_decl.resolver +// CHECK-NOT: @used_after_default_decl = weak_odr ifunc void (), ptr @used_after_default_decl.resolver +// CHECK-NOT: @used_no_default = weak_odr ifunc void (), ptr @used_no_default.resolver +// CHECK-NOT: @not_used_no_default = weak_odr ifunc void (), ptr @not_used_no_default.resolver +// CHECK: @not_used_with_default = weak_odr ifunc void (), ptr @not_used_with_default.resolver + + +// Test that an ifunc is generated and used when the default +// version is defined after the first use of the function. 
+// +__attribute__((target_version("aes"))) void used_before_default_def(void) {} +// CHECK-LABEL: define dso_local void @used_before_default_def._Maes( +// +void call_before_def(void) { used_before_default_def(); } +// CHECK-LABEL: define dso_local void @call_before_def( +// CHECK: call void @used_before_default_def() +// +__attribute__((target_version("default"))) void used_before_default_def(void) {} +// CHECK-LABEL: define dso_local void @used_before_default_def.default( +// +// CHECK-NOT: declare void @used_before_default_def( + + +// Test that an ifunc is generated and used when the default +// version is defined before the first use of the function. +// +__attribute__((target_version("aes"))) void used_after_default_def(void) {} +// CHECK-LABEL: define dso_local void @used_after_default_def._Maes( +// +__attribute__((target_version("default"))) void used_after_default_def(void) {} +// CHECK-LABEL: define dso_local void @used_after_default_def.default( +// +void call_after_def(void) { used_after_default_def(); } +// CHECK-LABEL: define dso_local void @call_after_def( +// CHECK: call void @used_after_default_def() +// +// CHECK-NOT: declare void @used_after_default_def( + + +// Test that an unmagled declaration is generated and used when the +// default version is declared after the first use of the function. +// +__attribute__((target_version("aes"))) void used_before_default_decl(void) {} +// CHECK-LABEL: define dso_local void @used_before_default_decl._Maes( +// +void call_before_decl(void) { used_before_default_decl(); } +// CHECK-LABEL: define dso_local void @call_before_decl( +// CHECK: call void @used_before_default_decl() +// +__attribute__((target_version("default"))) void used_before_default_decl(void); +// CHECK: declare void @used_before_default_decl() + + +// Test that an unmagled declaration is generated and used when the +// default version is declared before the first use of the function. 
+// +__attribute__((target_version("aes"))) void used_after_default_decl(void) {} +// CHECK-LABEL: define dso_local void @used_after_default_decl._Maes( +// +__attribute__((target_version("default"))) void used_after_default_decl(void); +// CHECK: declare void @used_after_default_decl() +// +void call_after_decl(void) { used_after_default_decl(); } +// CHECK-LABEL: define dso_local void @call_after_decl( +// CHECK: call void @used_after_default_decl() + + +// Test that an unmagled declaration is generated and used when +// the default version is not present. +// +__attribute__((target_version("aes"))) void used_no_default(void) {} +// CHECK-LABEL: define dso_local void @used_no_default._Maes( +// +void call_no_default(void) { used_no_default(); } +// CHECK-LABEL: define dso_local void @call_no_default( +// CHECK: call void @used_no_default() +// +// CHECK: declare void @used_no_default() + + +// Test that neither an ifunc nor a declaration is generated if the default +// definition is missing since the versioned function is not used. +// +__attribute__((target_version("aes"))) void not_used_no_default(void) {} +// CHECK-LABEL: define dso_local void @not_used_no_default._Maes( +// +// CHECK-NOT: declare void @not_used_no_default( + + +// Test that an ifunc is generated if the default version is defined but not used. 
+// +__attribute__((target_version("aes"))) void not_used_with_default(void) {} +// CHECK-LABEL: define dso_local void @not_used_with_default._Maes( +// +__attribute__((target_version("default"))) void not_used_with_default(void) {} +// CHECK-LABEL: define dso_local void @not_used_with_default.default( +// +// CHECK-NOT: declare void @not_used_with_default( + + +// CHECK: define weak_odr ptr @used_before_default_def.resolver() +// CHECK: define weak_odr ptr @used_after_default_def.resolver() +// CHECK-NOT: define weak_odr ptr @used_before_default_decl.resolver( +// CHECK-NOT: define weak_odr ptr @used_after_default_decl.resolver( +// CHECK-NOT: define weak_odr ptr @used_no_default.resolver( +// CHECK-NOT: define weak_odr ptr @not_used_no_default.resolver( +// CHECK: define weak_odr ptr @not_used_with_default.resolver() diff --git a/clang/test/CodeGen/aarch64-mixed-target-attributes.c b/clang/test/CodeGen/aarch64-mixed-target-attributes.c index 3c047fec6ceed..d779abd395b5f 100644 --- a/clang/test/CodeGen/aarch64-mixed-target-attributes.c +++ b/clang/test/CodeGen/aarch64-mixed-target-attributes.c @@ -261,9 +261,9 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void // CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-v9.5a" } // CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" } // CHECK: attributes #[[ATTR5:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR6:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR7:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.5a" } -// CHECK: 
attributes #[[ATTR8:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-v9.5a" } +// CHECK: attributes #[[ATTR6:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-v9.5a" } +// CHECK: attributes #[[ATTR7:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-v9.5a" } +// CHECK: attributes #[[ATTR8:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } //. // CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } //. diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index 60f9c7f1fc24e..846b08298cc72 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -32,8 +32,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver // CHECK: @ftc_dup2 = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver // CHECK: @ftc_dup3 = weak_odr ifunc i32 (), ptr @ftc_dup3.resolver -// CHECK: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver // CHECK: @ftc_inline2 = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver +// CHECK: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver // CHECK: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver //. 
// CHECK-MTE-BTI: @__aarch64_cpu_features = external dso_local global { i64 } @@ -42,8 +42,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver // CHECK-MTE-BTI: @ftc_dup2 = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver // CHECK-MTE-BTI: @ftc_dup3 = weak_odr ifunc i32 (), ptr @ftc_dup3.resolver -// CHECK-MTE-BTI: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver // CHECK-MTE-BTI: @ftc_inline2 = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver +// CHECK-MTE-BTI: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver // CHECK-MTE-BTI: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver //. // CHECK: Function Attrs: noinline nounwind optnone @@ -210,12 +210,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_inline2._Mfp16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_direct( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 @@ -236,86 +230,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: ret i32 [[ADD5]] // // -// CHECK-LABEL: @ftc_inline1.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// 
CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_inline1._MpredresMrcpc -// CHECK: resolver_else2: -// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 513 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513 -// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] -// CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] -// CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @ftc_inline1._MrngMsimd -// CHECK: resolver_else4: -// CHECK-NEXT: ret ptr @ftc_inline1.default -// -// -// CHECK-LABEL: @ftc_inline2.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @ftc_inline2._MfcmaMsve2-bitperm -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65536 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_inline2._Mfp16 -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @ftc_inline2.default -// -// -// CHECK-LABEL: 
@ftc_inline3.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @ftc_inline3._MsbMsve -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_inline3._Mbti -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @ftc_inline3.default -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_inline2._MfcmaMsve2-bitperm( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc.default( // CHECK-NEXT: entry: @@ -347,11 +261,45 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: @ftc_inline2._Mfp16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 2 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: @ftc_inline2._MfcmaMsve2-bitperm( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 2 +// +// +// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline2.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // +// CHECK-LABEL: @ftc_inline2.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call 
void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @ftc_inline2._MfcmaMsve2-bitperm +// CHECK: resolver_else: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65536 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK: resolver_return1: +// CHECK-NEXT: ret ptr @ftc_inline2._Mfp16 +// CHECK: resolver_else2: +// CHECK-NEXT: ret ptr @ftc_inline2.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline1._MrngMsimd( // CHECK-NEXT: entry: @@ -376,6 +324,36 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: ret i32 1 // // +// CHECK-LABEL: @ftc_inline1.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt +// CHECK: resolver_else: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 +// CHECK-NEXT: 
[[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK: resolver_return1: +// CHECK-NEXT: ret ptr @ftc_inline1._MpredresMrcpc +// CHECK: resolver_else2: +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 513 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] +// CHECK: resolver_return3: +// CHECK-NEXT: ret ptr @ftc_inline1._MrngMsimd +// CHECK: resolver_else4: +// CHECK-NEXT: ret ptr @ftc_inline1.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline3._Mbti( // CHECK-NEXT: entry: @@ -394,6 +372,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: ret i32 3 // // +// CHECK-LABEL: @ftc_inline3.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @ftc_inline3._MsbMsve +// CHECK: resolver_else: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK: 
resolver_return1: +// CHECK-NEXT: ret ptr @ftc_inline3._Mbti +// CHECK: resolver_else2: +// CHECK-NEXT: ret ptr @ftc_inline3.default +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @ftc( // CHECK-NOFMV-NEXT: entry: @@ -624,12 +624,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone -// CHECK-MTE-BTI-LABEL: @ftc_inline2._Mfp16( -// CHECK-MTE-BTI-NEXT: entry: -// CHECK-MTE-BTI-NEXT: ret i32 2 -// -// -// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: @ftc_direct( // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 4 @@ -650,86 +644,6 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI-NEXT: ret i32 [[ADD5]] // // -// CHECK-MTE-BTI-LABEL: @ftc_inline1.resolver( -// CHECK-MTE-BTI-NEXT: resolver_entry: -// CHECK-MTE-BTI-NEXT: call void @__init_cpu_features_resolver() -// CHECK-MTE-BTI-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456 -// CHECK-MTE-BTI-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456 -// CHECK-MTE-BTI-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-MTE-BTI-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK-MTE-BTI: resolver_return: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt -// CHECK-MTE-BTI: resolver_else: -// CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 -// CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK-MTE-BTI: resolver_return1: -// CHECK-MTE-BTI-NEXT: ret 
ptr @ftc_inline1._MpredresMrcpc -// CHECK-MTE-BTI: resolver_else2: -// CHECK-MTE-BTI-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 513 -// CHECK-MTE-BTI-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513 -// CHECK-MTE-BTI-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] -// CHECK-MTE-BTI-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] -// CHECK-MTE-BTI: resolver_return3: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._MrngMsimd -// CHECK-MTE-BTI: resolver_else4: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1.default -// -// -// CHECK-MTE-BTI-LABEL: @ftc_inline2.resolver( -// CHECK-MTE-BTI-NEXT: resolver_entry: -// CHECK-MTE-BTI-NEXT: call void @__init_cpu_features_resolver() -// CHECK-MTE-BTI-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040 -// CHECK-MTE-BTI-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040 -// CHECK-MTE-BTI-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-MTE-BTI-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK-MTE-BTI: resolver_return: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline2._MfcmaMsve2-bitperm -// CHECK-MTE-BTI: resolver_else: -// CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65536 -// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536 -// CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK-MTE-BTI: resolver_return1: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline2._Mfp16 -// CHECK-MTE-BTI: resolver_else2: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline2.default -// -// -// CHECK-MTE-BTI-LABEL: @ftc_inline3.resolver( -// CHECK-MTE-BTI-NEXT: resolver_entry: -// CHECK-MTE-BTI-NEXT: call void 
@__init_cpu_features_resolver() -// CHECK-MTE-BTI-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488 -// CHECK-MTE-BTI-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488 -// CHECK-MTE-BTI-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-MTE-BTI-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK-MTE-BTI: resolver_return: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline3._MsbMsve -// CHECK-MTE-BTI: resolver_else: -// CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624 -// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624 -// CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK-MTE-BTI: resolver_return1: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline3._Mbti -// CHECK-MTE-BTI: resolver_else2: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline3.default -// -// -// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone -// CHECK-MTE-BTI-LABEL: @ftc_inline2._MfcmaMsve2-bitperm( -// CHECK-MTE-BTI-NEXT: entry: -// CHECK-MTE-BTI-NEXT: ret i32 2 -// -// // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: @ftc.default( // CHECK-MTE-BTI-NEXT: entry: @@ -761,11 +675,45 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone +// CHECK-MTE-BTI-LABEL: @ftc_inline2._Mfp16( +// CHECK-MTE-BTI-NEXT: entry: +// CHECK-MTE-BTI-NEXT: ret i32 2 +// +// +// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone +// CHECK-MTE-BTI-LABEL: @ftc_inline2._MfcmaMsve2-bitperm( +// CHECK-MTE-BTI-NEXT: entry: +// CHECK-MTE-BTI-NEXT: ret i32 2 +// +// +// CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // 
CHECK-MTE-BTI-LABEL: @ftc_inline2.default( // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 2 // // +// CHECK-MTE-BTI-LABEL: @ftc_inline2.resolver( +// CHECK-MTE-BTI-NEXT: resolver_entry: +// CHECK-MTE-BTI-NEXT: call void @__init_cpu_features_resolver() +// CHECK-MTE-BTI-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-MTE-BTI-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040 +// CHECK-MTE-BTI-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040 +// CHECK-MTE-BTI-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-MTE-BTI-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK-MTE-BTI: resolver_return: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline2._MfcmaMsve2-bitperm +// CHECK-MTE-BTI: resolver_else: +// CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65536 +// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536 +// CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK-MTE-BTI: resolver_return1: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline2._Mfp16 +// CHECK-MTE-BTI: resolver_else2: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline2.default +// +// // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: @ftc_inline1._MrngMsimd( // CHECK-MTE-BTI-NEXT: entry: @@ -790,6 +738,36 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI-NEXT: ret i32 1 // // +// CHECK-MTE-BTI-LABEL: @ftc_inline1.resolver( +// CHECK-MTE-BTI-NEXT: resolver_entry: +// CHECK-MTE-BTI-NEXT: call void @__init_cpu_features_resolver() +// CHECK-MTE-BTI-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-MTE-BTI-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456 +// CHECK-MTE-BTI-NEXT: [[TMP2:%.*]] = icmp eq i64 
[[TMP1]], 18014535948435456 +// CHECK-MTE-BTI-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-MTE-BTI-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK-MTE-BTI: resolver_return: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt +// CHECK-MTE-BTI: resolver_else: +// CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 +// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK-MTE-BTI: resolver_return1: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._MpredresMrcpc +// CHECK-MTE-BTI: resolver_else2: +// CHECK-MTE-BTI-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-MTE-BTI-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 513 +// CHECK-MTE-BTI-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 513 +// CHECK-MTE-BTI-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] +// CHECK-MTE-BTI-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] +// CHECK-MTE-BTI: resolver_return3: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._MrngMsimd +// CHECK-MTE-BTI: resolver_else4: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1.default +// +// // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone // CHECK-MTE-BTI-LABEL: @ftc_inline3._Mbti( // CHECK-MTE-BTI-NEXT: entry: @@ -807,6 +785,28 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 3 // +// +// CHECK-MTE-BTI-LABEL: @ftc_inline3.resolver( +// CHECK-MTE-BTI-NEXT: resolver_entry: +// CHECK-MTE-BTI-NEXT: call void @__init_cpu_features_resolver() +// CHECK-MTE-BTI-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-MTE-BTI-NEXT: [[TMP1:%.*]] 
= and i64 [[TMP0]], 70369817919488 +// CHECK-MTE-BTI-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488 +// CHECK-MTE-BTI-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-MTE-BTI-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK-MTE-BTI: resolver_return: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline3._MsbMsve +// CHECK-MTE-BTI: resolver_else: +// CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624 +// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624 +// CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK-MTE-BTI: resolver_return1: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline3._Mbti +// CHECK-MTE-BTI: resolver_else2: +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline3.default +// //. // CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon" } // CHECK: attributes #[[ATTR1:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2" } diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 4edfc5408fae7..fbe34a51b40b6 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -11,15 +11,15 @@ int __attribute__((target_version("fp+aes"))) fmv(void) { return 6; } int __attribute__((target_version("crc+ls64_v"))) fmv(void) { return 7; } int __attribute__((target_version("bti"))) fmv(void) { return 8; } int __attribute__((target_version("sme2"))) fmv(void) { return 9; } -int __attribute__((target_version("default"))) fmv(void); +int __attribute__((target_version("default"))) fmv(void) { return 0; 
} int __attribute__((target_version("ls64+simd"))) fmv_one(void) { return 1; } int __attribute__((target_version("dpb"))) fmv_one(void) { return 2; } -int __attribute__((target_version("default"))) fmv_one(void); +int __attribute__((target_version("default"))) fmv_one(void) { return 0; } int __attribute__((target_version("fp"))) fmv_two(void) { return 1; } int __attribute__((target_version("simd"))) fmv_two(void) { return 2; } int __attribute__((target_version("dgh"))) fmv_two(void) { return 3; } int __attribute__((target_version("fp16+simd"))) fmv_two(void) { return 4; } -int __attribute__((target_version("default"))) fmv_two(void); +int __attribute__((target_version("default"))) fmv_two(void) { return 0; } int foo() { return fmv()+fmv_one()+fmv_two(); } @@ -124,11 +124,11 @@ __attribute__((target_version("rdma"))) int default_def_with_version_decls(void) // The following is guarded because in NOFMV we get errors for calling undeclared functions. #ifdef __HAVE_FUNCTION_MULTI_VERSIONING -// This should generate a default declaration, two target versions and the resolver. +// This should generate a default declaration, two target versions but no resolver. __attribute__((target_version("jscvt"))) int used_def_without_default_decl(void) { return 1; } __attribute__((target_version("rdma"))) int used_def_without_default_decl(void) { return 2; } -// This should generate a default declaration and the resolver. +// This should generate a default declaration but no resolver. 
__attribute__((target_version("jscvt"))) int used_decl_without_default_decl(void); __attribute__((target_version("rdma"))) int used_decl_without_default_decl(void); @@ -140,12 +140,10 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK: @fmv = weak_odr ifunc i32 (), ptr @fmv.resolver // CHECK: @fmv_one = weak_odr ifunc i32 (), ptr @fmv_one.resolver // CHECK: @fmv_two = weak_odr ifunc i32 (), ptr @fmv_two.resolver -// CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver // CHECK: @fmv_e = weak_odr ifunc i32 (), ptr @fmv_e.resolver // CHECK: @fmv_d = internal ifunc i32 (), ptr @fmv_d.resolver // CHECK: @fmv_c = weak_odr ifunc void (), ptr @fmv_c.resolver -// CHECK: @used_def_without_default_decl = weak_odr ifunc i32 (), ptr @used_def_without_default_decl.resolver -// CHECK: @used_decl_without_default_decl = weak_odr ifunc i32 (), ptr @used_decl_without_default_decl.resolver +// CHECK: @fmv_inline = weak_odr ifunc i32 (), ptr @fmv_inline.resolver // CHECK: @unused_with_default_def = weak_odr ifunc i32 (), ptr @unused_with_default_def.resolver // CHECK: @unused_with_implicit_default_def = weak_odr ifunc i32 (), ptr @unused_with_implicit_default_def.resolver // CHECK: @unused_with_implicit_forward_default_def = weak_odr ifunc i32 (), ptr @unused_with_implicit_forward_default_def.resolver @@ -215,6 +213,13 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv.default +// CHECK-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_one._Mls64Msimd // CHECK-SAME: () #[[ATTR5]] { // CHECK-NEXT: entry: @@ -229,6 +234,13 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: 
define {{[^@]+}}@fmv_one.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp // CHECK-SAME: () #[[ATTR5]] { // CHECK-NEXT: entry: @@ -244,21 +256,28 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_two._Mdgh -// CHECK-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp16Msimd -// CHECK-SAME: () #[[ATTR12:[0-9]+]] { +// CHECK-SAME: () #[[ATTR11:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 // // // CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@foo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one() @@ -268,6 +287,183 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret i32 [[ADD3]] // // +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_e.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 20 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_c.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: 
ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@goo +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d() +// CHECK-NEXT: call void @fmv_c() +// CHECK-NEXT: [[CALL3:%.*]] = call i32 @fmv_default() +// CHECK-NEXT: ret i32 [[CALL3]] +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 111 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@recur +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @reca() +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@hoo +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: call void @f(ptr noundef @fmv) +// CHECK-NEXT: store ptr @fmv, ptr [[FP1]], align 8 +// CHECK-NEXT: store ptr @fmv, ptr [[FP2]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FP1]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call i32 [[TMP0]]() +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[FP2]], align 8 +// CHECK-NEXT: [[CALL1:%.*]] = call i32 [[TMP1]]() +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] +// CHECK-NEXT: ret i32 [[ADD]] +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops +// CHECK-SAME: () #[[ATTR13:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod +// CHECK-SAME: () #[[ATTR14:[0-9]+]] { +// 
CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_default_decl._Maes +// CHECK-SAME: () #[[ATTR5]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve +// CHECK-SAME: () #[[ATTR15:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def._Mfp16 +// CHECK-SAME: () #[[ATTR11]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse +// CHECK-SAME: () #[[ATTR16:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm +// CHECK-SAME: () #[[ATTR17:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// 
+// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt +// CHECK-SAME: () #[[ATTR19:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm +// CHECK-SAME: () #[[ATTR17]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 2 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@caller +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = call i32 @used_def_without_default_decl() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @used_decl_without_default_decl() +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] +// CHECK-NEXT: ret i32 [[ADD]] +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@main +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: call void @recur() +// CHECK-NEXT: [[CALL:%.*]] = call i32 @goo() +// CHECK-NEXT: ret i32 [[CALL]] +// +// // CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -406,57 +602,185 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv_two.default // // -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_e.default -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 20 +// CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685248 +// CHECK-NEXT: 
[[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685248 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @fmv_e._Mls64 +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @fmv_e.default // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_d._Msb -// CHECK-SAME: () #[[ATTR13:[0-9]+]] { +// CHECK-SAME: () #[[ATTR20:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_d.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR9]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // +// CHECK-LABEL: define {{[^@]+}}@fmv_d.resolver() { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @fmv_d._Msb +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @fmv_d.default +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @fmv_c._Mssbs +// CHECK: 
resolver_else: +// CHECK-NEXT: ret ptr @fmv_c.default +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1 +// CHECK-SAME: () #[[ATTR21:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: ret void +// CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_c.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfcmaMfp16MrdmMsme +// CHECK-SAME: () #[[ATTR22:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: ret void +// CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@goo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf32mmMi8mmMsha3 +// CHECK-SAME: () #[[ATTR23:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_d() -// CHECK-NEXT: call void @fmv_c() -// CHECK-NEXT: [[CALL3:%.*]] = call i32 @fmv_default() -// CHECK-NEXT: ret i32 [[CALL3]] +// CHECK-NEXT: ret i32 12 // // -// CHECK-LABEL: define {{[^@]+}}@fmv_inline.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MditMsve-ebf16 +// CHECK-SAME: () #[[ATTR24:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 8 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2 +// CHECK-SAME: () #[[ATTR25:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 6 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define 
{{[^@]+}}@fmv_inline._Mdpb2Mjscvt +// CHECK-SAME: () #[[ATTR26:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 7 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfrinttsMrcpc +// CHECK-SAME: () #[[ATTR27:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 3 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsveMsve-bf16 +// CHECK-SAME: () #[[ATTR28:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 4 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3 +// CHECK-SAME: () #[[ATTR29:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 5 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-bitpermMsve2-pmull128 +// CHECK-SAME: () #[[ATTR30:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 9 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag2Msve2-sm4 +// CHECK-SAME: () #[[ATTR31:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 10 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag3MmopsMrcpc3 +// CHECK-SAME: () #[[ATTR32:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 11 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMdotprod +// CHECK-SAME: () #[[ATTR14]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 13 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mfp16fmlMsimd +// CHECK-SAME: () #[[ATTR4]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 14 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4 +// CHECK-SAME: () #[[ATTR33:[0-9]+]] { +// CHECK-NEXT: 
entry: +// CHECK-NEXT: ret i32 15 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm +// CHECK-SAME: () #[[ATTR34:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 16 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline.default +// CHECK-SAME: () #[[ATTR9]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 3 +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_inline.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4398048673856 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398048673856 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] @@ -587,355 +911,6 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv_inline.default // // -// CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685248 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685248 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_e._Mls64 -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @fmv_e.default -// -// -// CHECK-LABEL: define {{[^@]+}}@fmv_d.resolver() { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 
70368744177664 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_d._Msb -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @fmv_d.default -// -// -// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_c._Mssbs -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @fmv_c.default -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_default -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 111 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@recur -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: call void @reca() -// CHECK-NEXT: ret void -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@hoo -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: call void @f(ptr noundef @fmv) -// CHECK-NEXT: store ptr @fmv, ptr [[FP1]], align 8 -// CHECK-NEXT: store ptr @fmv, ptr [[FP2]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[FP1]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call i32 [[TMP0]]() -// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[FP2]], align 8 -// CHECK-NEXT: [[CALL1:%.*]] = call i32 [[TMP1]]() -// CHECK-NEXT: 
[[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: ret i32 [[ADD]] -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops -// CHECK-SAME: () #[[ATTR14:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod -// CHECK-SAME: () #[[ATTR15:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_default_decl._Maes -// CHECK-SAME: () #[[ATTR5]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve -// CHECK-SAME: () #[[ATTR16:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.default -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def._Mfp16 -// CHECK-SAME: () #[[ATTR12]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def.default -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse -// CHECK-SAME: () 
#[[ATTR17:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm -// CHECK-SAME: () #[[ATTR18:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt -// CHECK-SAME: () #[[ATTR21:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm -// CHECK-SAME: () #[[ATTR18]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@caller -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 @used_def_without_default_decl() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @used_decl_without_default_decl() -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: ret i32 [[ADD]] -// -// -// CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @used_def_without_default_decl._Mjscvt -// CHECK: resolver_else: -// CHECK-NEXT: 
[[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 64 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 64 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @used_def_without_default_decl._Mrdm -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @used_def_without_default_decl.default -// -// -// CHECK-LABEL: define {{[^@]+}}@used_decl_without_default_decl.resolver() comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @used_decl_without_default_decl._Mjscvt -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 64 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 64 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @used_decl_without_default_decl._Mrdm -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @used_decl_without_default_decl.default -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@main -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-NEXT: call void @recur() -// CHECK-NEXT: [[CALL:%.*]] = call i32 @goo() -// 
CHECK-NEXT: ret i32 [[CALL]] -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1 -// CHECK-SAME: () #[[ATTR22:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfcmaMfp16MrdmMsme -// CHECK-SAME: () #[[ATTR23:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf32mmMi8mmMsha3 -// CHECK-SAME: () #[[ATTR24:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 12 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MditMsve-ebf16 -// CHECK-SAME: () #[[ATTR25:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 8 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2 -// CHECK-SAME: () #[[ATTR26:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 6 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mdpb2Mjscvt -// CHECK-SAME: () #[[ATTR27:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 7 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfrinttsMrcpc -// CHECK-SAME: () #[[ATTR28:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 3 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsveMsve-bf16 -// CHECK-SAME: () #[[ATTR29:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 4 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3 -// CHECK-SAME: () #[[ATTR30:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 5 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// 
CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-bitpermMsve2-pmull128 -// CHECK-SAME: () #[[ATTR31:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 9 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag2Msve2-sm4 -// CHECK-SAME: () #[[ATTR32:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 10 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag3MmopsMrcpc3 -// CHECK-SAME: () #[[ATTR33:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 11 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMdotprod -// CHECK-SAME: () #[[ATTR15]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 13 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mfp16fmlMsimd -// CHECK-SAME: () #[[ATTR4]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 14 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4 -// CHECK-SAME: () #[[ATTR34:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 15 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm -// CHECK-SAME: () #[[ATTR35:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 16 -// -// -// CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline.default -// CHECK-SAME: () #[[ATTR11]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 3 -// -// // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -1013,6 +988,27 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv +// 
CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-NEXT: entry: +// CHECK-NOFMV-NEXT: ret i32 0 +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_one +// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-NEXT: entry: +// CHECK-NOFMV-NEXT: ret i32 0 +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_two +// CHECK-NOFMV-SAME: () #[[ATTR0]] { +// CHECK-NOFMV-NEXT: entry: +// CHECK-NOFMV-NEXT: ret i32 0 +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_e // CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: @@ -1125,33 +1121,33 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR9:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR9]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR12]] = { noinline 
nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" } -// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" } -// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme,-v9.5a" } -// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+jsconv,+neon,-v9.5a" } -// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } -// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-v9.5a" } -// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-v9.5a" } -// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-v9.5a" } -// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4,-v9.5a" } -// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+rdm,-v9.5a" } +// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR12:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } +// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" } +// CHECK: attributes #[[ATTR18:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR19]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR20]] = { noinline 
nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } +// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme,-v9.5a" } +// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-v9.5a" } +// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } +// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+jsconv,+neon,-v9.5a" } +// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" } +// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-v9.5a" } +// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-v9.5a" } +// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-v9.5a" } +// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4,-v9.5a" } +// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+rdm,-v9.5a" } +// CHECK: attributes #[[ATTR35:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" } //. // CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } diff --git a/clang/test/CodeGenCXX/aarch64-fmv-resolver-emission.cpp b/clang/test/CodeGenCXX/aarch64-fmv-resolver-emission.cpp new file mode 100644 index 0000000000000..79c07c0d9db11 --- /dev/null +++ b/clang/test/CodeGenCXX/aarch64-fmv-resolver-emission.cpp @@ -0,0 +1,111 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s + +// CHECK: @_Z23used_before_default_defv = weak_odr ifunc void (), ptr @_Z23used_before_default_defv.resolver +// CHECK: @_Z22used_after_default_defv = weak_odr ifunc void (), ptr @_Z22used_after_default_defv.resolver +// CHECK-NOT: @_Z24used_before_default_declv = weak_odr ifunc void (), ptr @_Z24used_before_default_declv.resolver +// CHECK-NOT: 
@_Z23used_after_default_declv = weak_odr ifunc void (), ptr @_Z23used_after_default_declv.resolver +// CHECK-NOT: @_Z15used_no_defaultv = weak_odr ifunc void (), ptr @_Z15used_no_defaultv.resolver +// CHECK-NOT: @_Z19not_used_no_defaultv = weak_odr ifunc void (), ptr @_Z19not_used_no_defaultv.resolver +// CHECK: @_Z21not_used_with_defaultv = weak_odr ifunc void (), ptr @_Z21not_used_with_defaultv.resolver + + +// Test that an ifunc is generated and used when the default +// version is defined after the first use of the function. +// +__attribute__((target_version("aes"))) void used_before_default_def(void) {} +// CHECK-LABEL: define dso_local void @_Z23used_before_default_defv._Maes( +// +void call_before_def(void) { used_before_default_def(); } +// CHECK-LABEL: define dso_local void @_Z15call_before_defv( +// CHECK: call void @_Z23used_before_default_defv() +// +__attribute__((target_version("default"))) void used_before_default_def(void) {} +// CHECK-LABEL: define dso_local void @_Z23used_before_default_defv.default( +// +// CHECK-NOT: declare void @_Z23used_before_default_defv( + + +// Test that an ifunc is generated and used when the default +// version is defined before the first use of the function. +// +__attribute__((target_version("aes"))) void used_after_default_def(void) {} +// CHECK-LABEL: define dso_local void @_Z22used_after_default_defv._Maes( +// +__attribute__((target_version("default"))) void used_after_default_def(void) {} +// CHECK-LABEL: define dso_local void @_Z22used_after_default_defv.default( +// +void call_after_def(void) { used_after_default_def(); } +// CHECK-LABEL: define dso_local void @_Z14call_after_defv( +// CHECK: call void @_Z22used_after_default_defv() +// +// CHECK-NOT: declare void @_Z22used_after_default_defv( + + +// Test that an unmangled declaration is generated and used when the +// default version is declared after the first use of the function.
+// +__attribute__((target_version("aes"))) void used_before_default_decl(void) {} +// CHECK-LABEL: define dso_local void @_Z24used_before_default_declv._Maes( +// +void call_before_decl(void) { used_before_default_decl(); } +// CHECK-LABEL: define dso_local void @_Z16call_before_declv( +// CHECK: call void @_Z24used_before_default_declv() +// +__attribute__((target_version("default"))) void used_before_default_decl(void); +// CHECK: declare void @_Z24used_before_default_declv() + + +// Test that an unmangled declaration is generated and used when the +// default version is declared before the first use of the function. +// +__attribute__((target_version("aes"))) void used_after_default_decl(void) {} +// CHECK-LABEL: define dso_local void @_Z23used_after_default_declv._Maes( +// +__attribute__((target_version("default"))) void used_after_default_decl(void); +// CHECK: declare void @_Z23used_after_default_declv() +// +void call_after_decl(void) { used_after_default_decl(); } +// CHECK-LABEL: define dso_local void @_Z15call_after_declv( +// CHECK: call void @_Z23used_after_default_declv() + + +// Test that an unmangled declaration is generated and used when +// the default version is not present. +// +__attribute__((target_version("aes"))) void used_no_default(void) {} +// CHECK-LABEL: define dso_local void @_Z15used_no_defaultv._Maes( +// +void call_no_default(void) { used_no_default(); } +// CHECK-LABEL: define dso_local void @_Z15call_no_defaultv( +// CHECK: call void @_Z15used_no_defaultv() +// +// CHECK: declare void @_Z15used_no_defaultv() + + +// Test that neither an ifunc nor a declaration is generated if the default +// definition is missing since the versioned function is not used.
+// +__attribute__((target_version("aes"))) void not_used_no_default(void) {} +// CHECK-LABEL: define dso_local void @_Z19not_used_no_defaultv._Maes( +// +// CHECK-NOT: declare void @_Z19not_used_no_defaultv( + + +// Test that an ifunc is generated if the default version is defined but not used. +// +__attribute__((target_version("aes"))) void not_used_with_default(void) {} +// CHECK-LABEL: define dso_local void @_Z21not_used_with_defaultv._Maes( +// +__attribute__((target_version("default"))) void not_used_with_default(void) {} +// CHECK-LABEL: define dso_local void @_Z21not_used_with_defaultv.default( +// +// CHECK-NOT: declare void @_Z21not_used_with_defaultv( + + +// CHECK: define weak_odr ptr @_Z23used_before_default_defv.resolver() +// CHECK: define weak_odr ptr @_Z22used_after_default_defv.resolver() +// CHECK-NOT: define weak_odr ptr @_Z24used_before_default_declv.resolver( +// CHECK-NOT: define weak_odr ptr @_Z23used_after_default_declv.resolver( +// CHECK-NOT: define weak_odr ptr @_Z15used_no_defaultv.resolver( +// CHECK-NOT: define weak_odr ptr @_Z19not_used_no_defaultv.resolver( +// CHECK: define weak_odr ptr @_Z21not_used_with_defaultv.resolver() diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp index 29ae6b6856500..6405621a9d647 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s int __attribute__((target_clones("ls64_v+fp16", "default"))) foo_ovl(int) { return 1; } @@ -45,56 +45,60 @@ void run_foo_tml() { // CHECK: 
@_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver // CHECK: @_ZN7MyClassIisE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver //. -// CHECK-LABEL: @_Z7foo_ovli._Mfp16Mls64_v( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli._Mfp16Mls64_v( +// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_Z7foo_ovli.resolver( -// CHECK-NEXT: resolver_entry: +// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovli.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4503599627436032 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4503599627436032 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: // CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16Mls64_v -// CHECK: resolver_else: +// CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z7foo_ovli.default // // -// CHECK-LABEL: @_Z7foo_ovlv._Mls64Mls64_accdata( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv._Mls64Mls64_accdata( +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_Z7foo_ovlv.resolver( -// CHECK-NEXT: resolver_entry: +// CHECK-LABEL: define weak_odr ptr @_Z7foo_ovlv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: 
call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 11258999068426240 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 11258999068426240 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: // CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mls64Mls64_accdata -// CHECK: resolver_else: +// CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z7foo_ovlv.default // // -// CHECK-LABEL: @_Z3barv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z3barv( +// CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z7foo_ovli(i32 noundef 1) // CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z7foo_ovlv() // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] // CHECK-NEXT: ret i32 [[ADD]] // // -// CHECK-LABEL: @_Z11run_foo_tmlv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local void @_Z11run_foo_tmlv( +// CHECK-SAME: ) #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[MC1:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1 // CHECK-NEXT: [[MC2:%.*]] = alloca [[STRUCT_MYCLASS_0:%.*]], align 1 // CHECK-NEXT: [[MC3:%.*]] = alloca [[STRUCT_MYCLASS_1:%.*]], align 1 @@ -106,131 +110,141 @@ void run_foo_tml() { // CHECK-NEXT: ret void // // -// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// 
CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv._Mfrintts -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv.default -// -// -// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv._Mfrintts -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv.default -// -// -// CHECK-LABEL: @_ZN7MyClassIfsE7foo_tmlEv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define linkonce_odr 
noundef i32 @_ZN7MyClassIfsE7foo_tmlEv( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 3 // // -// CHECK-LABEL: @_ZN7MyClassIdfE7foo_tmlEv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIdfE7foo_tmlEv( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 4 // // -// CHECK-LABEL: @_Z7foo_ovli.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli.default( +// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_Z7foo_ovlv.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv.default( +// CHECK-SAME: ) #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv._Mfrintts( -// CHECK-NEXT: entry: +// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIssE7foo_tmlEv._Mfrintts( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR3:[0-9]+]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, 
align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs( -// CHECK-NEXT: entry: +// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR4:[0-9]+]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIssE7foo_tmlEv.default( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv._Mfrintts( -// CHECK-NEXT: entry: +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIssE7foo_tmlEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], 
label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: +// CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs +// CHECK: [[RESOLVER_ELSE]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]] +// CHECK: [[RESOLVER_RETURN1]]: +// CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv._Mfrintts +// CHECK: [[RESOLVER_ELSE2]]: +// CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv.default +// +// +// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIisE7foo_tmlEv._Mfrintts( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR3]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs( -// CHECK-NEXT: entry: +// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR4]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define linkonce_odr noundef i32 @_ZN7MyClassIisE7foo_tmlEv.default( +// CHECK-SAME: ptr noundef nonnull align 1 
dereferenceable(1) [[THIS:%.*]]) #[[ATTR2]] comdat { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 2 // +// +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClassIisE7foo_tmlEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: +// CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs +// CHECK: [[RESOLVER_ELSE]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]] +// CHECK: [[RESOLVER_RETURN1]]: +// CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv._Mfrintts +// CHECK: [[RESOLVER_ELSE2]]: +// CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv.default +// //. 
-// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ls64" } -// CHECK: attributes #[[ATTR2:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #[[ATTR3:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint" } -// CHECK: attributes #[[ATTR4:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } +// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } +// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ls64" } +// CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint" } +// CHECK: attributes #[[ATTR4]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } //. 
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp index fd19f4c5a3030..6661abead20c6 100644 --- a/clang/test/CodeGenCXX/attr-target-version.cpp +++ b/clang/test/CodeGenCXX/attr-target-version.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5 // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s int __attribute__((target_version("sme-f64f64+bf16"))) foo(int) { return 1; } @@ -59,152 +59,169 @@ int bar() { return m.goo(1) + foo(1) + foo(); } - //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver // CHECK: @_Z3fooi = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver // CHECK: @_Z3foov = weak_odr ifunc i32 (), ptr @_Z3foov.resolver +// CHECK: @_ZN7MyClass3gooEi = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver // CHECK: @_ZN7MyClass23unused_with_default_defEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClass23unused_with_default_defEv.resolver // CHECK: @_ZN7MyClass32unused_with_implicit_default_defEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver // CHECK: @_ZN7MyClass40unused_with_implicit_forward_default_defEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver //. 
-// CHECK-LABEL: @_Z3fooi._Mbf16Msme-f64f64( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z3fooi._Mbf16Msme-f64f64( +// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_Z3fooi.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z3fooi.default( +// CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_Z3foov._Mebf16Msm4( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z3foov._Mebf16Msm4( +// CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 3 // // -// CHECK-LABEL: @_Z3foov.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z3foov.default( +// CHECK-SAME: ) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 4 // // -// CHECK-LABEL: @_ZN7MyClass3gooEi.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass3gooEi.default( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]], i32 noundef [[TMP0:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: [[THIS1:%.*]] = load 
ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_ZN7MyClass3gooEi._Mcrc( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass3gooEi._Mcrc( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]], i32 noundef [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_ZN7MyClass3gooEi._Mdotprod( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass3gooEi._Mdotprod( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]], i32 noundef [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 -// CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 3 // // -// CHECK-LABEL: @_ZN7MyClass32unused_with_forward_default_declEv._Mmops( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass32unused_with_forward_default_declEv._Mmops( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr 
[[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 0 // // -// CHECK-LABEL: @_ZN7MyClass41unused_with_implicit_forward_default_declEv._Mdotprod( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass41unused_with_implicit_forward_default_declEv._Mdotprod( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR4]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 0 // // -// CHECK-LABEL: @_ZN7MyClass24unused_with_default_declEv._Maes( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass24unused_with_default_declEv._Maes( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR6:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 0 // // -// CHECK-LABEL: @_ZN7MyClass23unused_with_default_defEv._Msve( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass23unused_with_default_defEv._Msve( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: 
ret i32 0 // // -// CHECK-LABEL: @_ZN7MyClass23unused_with_default_defEv.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass23unused_with_default_defEv.default( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_ZN7MyClass32unused_with_implicit_default_defEv._Mfp16( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass32unused_with_implicit_default_defEv._Mfp16( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR8:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 0 // // -// CHECK-LABEL: @_ZN7MyClass32unused_with_implicit_default_defEv.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass32unused_with_implicit_default_defEv.default( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_ZN7MyClass40unused_with_implicit_forward_default_defEv.default( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 
@_ZN7MyClass40unused_with_implicit_forward_default_defEv.default( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 0 // // -// CHECK-LABEL: @_ZN7MyClass40unused_with_implicit_forward_default_defEv._Mlse( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass40unused_with_implicit_forward_default_defEv._Mlse( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR9:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 // // -// CHECK-LABEL: @_ZN7MyClass22unused_without_defaultEv._Mrdm( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_ZN7MyClass22unused_without_defaultEv._Mrdm( +// CHECK-SAME: ptr noundef nonnull align 1 dereferenceable(1) [[THIS:%.*]]) #[[ATTR10:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 0 // // -// CHECK-LABEL: @_Z3barv( -// CHECK-NEXT: entry: +// CHECK-LABEL: define dso_local noundef i32 @_Z3barv( +// CHECK-SAME: ) #[[ATTR1]] { +// CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[M:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1 // CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi(ptr noundef 
nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1) // CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z3fooi(i32 noundef 1) @@ -214,109 +231,109 @@ int bar() { // CHECK-NEXT: ret i32 [[ADD3]] // // -// CHECK-LABEL: @_ZN7MyClass3gooEi.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1024 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1024 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mcrc -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mdotprod -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi.default -// -// -// CHECK-LABEL: @_Z3fooi.resolver( -// CHECK-NEXT: resolver_entry: +// CHECK-LABEL: define weak_odr ptr @_Z3fooi.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: 
[[RESOLVER_RETURN]]: // CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64 -// CHECK: resolver_else: +// CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z3fooi.default // // -// CHECK-LABEL: @_Z3foov.resolver( -// CHECK-NEXT: resolver_entry: +// CHECK-LABEL: define weak_odr ptr @_Z3foov.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: // CHECK-NEXT: ret ptr @_Z3foov._Mebf16Msm4 -// CHECK: resolver_else: +// CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z3foov.default // // -// CHECK-LABEL: @_ZN7MyClass23unused_with_default_defEv.resolver( -// CHECK-NEXT: resolver_entry: +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass3gooEi.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1024 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1024 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: +// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mcrc +// CHECK: [[RESOLVER_ELSE]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// 
CHECK-NEXT: br i1 [[TMP7]], label %[[RESOLVER_RETURN1:.*]], label %[[RESOLVER_ELSE2:.*]] +// CHECK: [[RESOLVER_RETURN1]]: +// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mdotprod +// CHECK: [[RESOLVER_ELSE2]]: +// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi.default +// +// +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass23unused_with_default_defEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: // CHECK-NEXT: ret ptr @_ZN7MyClass23unused_with_default_defEv._Msve -// CHECK: resolver_else: +// CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_ZN7MyClass23unused_with_default_defEv.default // // -// CHECK-LABEL: @_ZN7MyClass32unused_with_implicit_default_defEv.resolver( -// CHECK-NEXT: resolver_entry: +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass32unused_with_implicit_default_defEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65536 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65536 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: // CHECK-NEXT: ret ptr 
@_ZN7MyClass32unused_with_implicit_default_defEv._Mfp16 -// CHECK: resolver_else: +// CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_ZN7MyClass32unused_with_implicit_default_defEv.default // // -// CHECK-LABEL: @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver( -// CHECK-NEXT: resolver_entry: +// CHECK-LABEL: define weak_odr ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 128 // CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 128 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: // CHECK-NEXT: ret ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv._Mlse -// CHECK: resolver_else: +// CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_ZN7MyClass40unused_with_implicit_forward_default_defEv.default // //. 
-// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #[[ATTR2:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" } -// CHECK: attributes #[[ATTR3:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc" } -// CHECK: attributes #[[ATTR4:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR5:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" } -// CHECK: attributes #[[ATTR6:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR7:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } -// CHECK: attributes #[[ATTR8:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } -// CHECK: attributes #[[ATTR9:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" } -// CHECK: attributes #[[ATTR10:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm" } +// CHECK: 
attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } +// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" } +// CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc" } +// CHECK: attributes #[[ATTR4]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR5]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" } +// CHECK: attributes #[[ATTR6]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR7]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } +// CHECK: attributes #[[ATTR8]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } +// CHECK: attributes #[[ATTR9]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" } +// CHECK: attributes #[[ATTR10]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm" } // CHECK: attributes #[[ATTR11:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" } 
//. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp index abfff1a74f86a..1ac88e68a3a12 100644 --- a/clang/test/CodeGenCXX/fmv-namespace.cpp +++ b/clang/test/CodeGenCXX/fmv-namespace.cpp @@ -26,7 +26,6 @@ __attribute((target_version("mops"))) int bar() { return 1; } //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } // CHECK: @_ZN4Name3fooEv = weak_odr ifunc i32 (), ptr @_ZN4Name3fooEv.resolver -// CHECK: @_ZN9OtherName3fooEv = weak_odr ifunc i32 (), ptr @_ZN9OtherName3fooEv.resolver // CHECK: @_ZN3Foo3barEv = weak_odr ifunc i32 (), ptr @_ZN3Foo3barEv.resolver //. // CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv._Msve( @@ -42,20 +41,6 @@ __attribute((target_version("mops"))) int bar() { return 1; } // CHECK-NEXT: ret i32 [[CALL]] // // -// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat { -// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] -// CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @_ZN4Name3fooEv._Msve -// CHECK: [[RESOLVER_ELSE]]: -// CHECK-NEXT: ret ptr @_ZN4Name3fooEv.default -// -// // CHECK-LABEL: define dso_local noundef i32 @_ZN9OtherName3fooEv._Msve( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -69,20 +54,6 @@ __attribute((target_version("mops"))) int bar() { return 1; } // CHECK-NEXT: ret i32 [[CALL]] // // -// CHECK-LABEL: define weak_odr ptr @_ZN9OtherName3fooEv.resolver() comdat { -// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// 
CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] -// CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv._Msve -// CHECK: [[RESOLVER_ELSE]]: -// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv.default -// -// // CHECK-LABEL: define dso_local noundef i32 @_ZN3Foo3barEv.default( // CHECK-SAME: ) #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] @@ -90,7 +61,7 @@ __attribute((target_version("mops"))) int bar() { return 1; } // // // CHECK-LABEL: define dso_local noundef i32 @_ZN3Foo3barEv._Mmops( -// CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +// CHECK-SAME: ) #[[ATTR3:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 1 // @@ -101,6 +72,20 @@ __attribute((target_version("mops"))) int bar() { return 1; } // CHECK-NEXT: ret i32 0 // // +// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat { +// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] +// CHECK: [[RESOLVER_RETURN]]: +// CHECK-NEXT: ret ptr @_ZN4Name3fooEv._Msve +// CHECK: [[RESOLVER_ELSE]]: +// CHECK-NEXT: ret ptr @_ZN4Name3fooEv.default +// +// // CHECK-LABEL: define weak_odr ptr @_ZN3Foo3barEv.resolver() comdat { // CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -117,8 +102,8 @@ __attribute((target_version("mops"))) int bar() { return 1; } //. 
// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } // CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #[[ATTR2]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" } -// CHECK: attributes #[[ATTR3:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } +// CHECK: attributes #[[ATTR3]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" } //. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} From 9af3628ce7400a96205a4c4468867c3c11dd4b2f Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Thu, 18 Jul 2024 17:43:28 +0200 Subject: [PATCH 037/486] [SystemZ] Fix transparent_union calling convention The SystemZ ABI code was missing code to handle the transparent_union extension. Arguments of such types are specified to be passed like the first member of the union, instead of according to the usual ABI calling convention for aggregates. This did not make much difference in practice as the SystemZ ABI already specifies that 1-, 2-, 4- or 8-byte aggregates are passed in registers. However, there *is* a difference if the first member of the transparent union is a scalar integer type smaller than word size - if passed as a scalar, it needs to be zero- or sign-extended to word size, while if passed as aggregate, it is not. Fixed by adding code to handle transparent_union similar to what is done on other targets. 
--- clang/lib/CodeGen/Targets/SystemZ.cpp | 5 ++++- clang/test/CodeGen/SystemZ/systemz-abi.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp index e6b63b6fe093f..4d61f51379346 100644 --- a/clang/lib/CodeGen/Targets/SystemZ.cpp +++ b/clang/lib/CodeGen/Targets/SystemZ.cpp @@ -412,13 +412,16 @@ ABIArgInfo SystemZABIInfo::classifyReturnType(QualType RetTy) const { } ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const { + // Handle transparent union types. + Ty = useFirstFieldIfTransparentUnion(Ty); + // Handle the generic C++ ABI. if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory); // Integers and enums are extended to full register width. if (isPromotableIntegerTypeForABI(Ty)) - return ABIArgInfo::getExtend(Ty); + return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty)); // Handle vector types and vector-like structure types. 
Note that // as opposed to float-like structure types, we do not allow any diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c index 65a2bc9bbb680..3e39425e57f17 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi.c @@ -173,6 +173,29 @@ union union_double pass_union_double(union union_double arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_union_double(ptr dead_on_unwind noalias writable sret(%union.union_double) align 8 %{{.*}}, i64 %{{.*}}) +// Verify that transparent unions are passed like their first member (but returned like a union) + +union tu_char { char a; } __attribute__((transparent_union)); +union tu_char pass_tu_char(union tu_char arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_tu_char(ptr dead_on_unwind noalias writable sret(%union.tu_char) align 1 %{{.*}}, i8 signext %{{.*}}) + +union tu_short { short a; } __attribute__((transparent_union)); +union tu_short pass_tu_short(union tu_short arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_tu_short(ptr dead_on_unwind noalias writable sret(%union.tu_short) align 2 %{{.*}}, i16 signext %{{.*}}) + +union tu_int { int a; } __attribute__((transparent_union)); +union tu_int pass_tu_int(union tu_int arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_tu_int(ptr dead_on_unwind noalias writable sret(%union.tu_int) align 4 %{{.*}}, i32 signext %{{.*}}) + +union tu_long { long a; } __attribute__((transparent_union)); +union tu_long pass_tu_long(union tu_long arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_tu_long(ptr dead_on_unwind noalias writable sret(%union.tu_long) align 8 %{{.*}}, i64 %{{.*}}) + +union tu_ptr { void *a; } __attribute__((transparent_union)); +union tu_ptr pass_tu_ptr(union tu_ptr arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_tu_ptr(ptr dead_on_unwind noalias writable sret(%union.tu_ptr) align 8 %{{.*}}, ptr %{{.*}}) + + // 
Accessing variable argument lists int va_int(__builtin_va_list l) { return __builtin_va_arg(l, int); } From 0c4023ae3b64c54ff51947e9776aee0e963c5635 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Jul 2024 08:47:06 -0700 Subject: [PATCH 038/486] [RISCV] Use Root instead of N throughout the worklist loop in combineBinOp_VLToVWBinOp_VL. (#99416) We were only checking that the node from the worklist is a supported root. We weren't checking the strategy or any of its operands unless it was the original node. For any other node, we just rechecked the original node's strategy and operands. The effect of this is that we don't do all of the transformations at once. Instead, when there were multiple possible nodes to transform we would only do them as each node was visited by the main DAG combine worklist. The test shows a case where we widened an instruction without removing all of the uses of the vsext. The sext is shared by one node that shares another sext node with the root another node that doesn't share anything with the root. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 ++--- .../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll | 38 +++++++++++++++++++ 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 21193ebe1eb94..e938454b8e642 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14998,8 +14998,8 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, if (!NodeExtensionHelper::isSupportedRoot(Root, Subtarget)) return SDValue(); - NodeExtensionHelper LHS(N, 0, DAG, Subtarget); - NodeExtensionHelper RHS(N, 1, DAG, Subtarget); + NodeExtensionHelper LHS(Root, 0, DAG, Subtarget); + NodeExtensionHelper RHS(Root, 1, DAG, Subtarget); auto AppendUsersIfNeeded = [&Worklist, &Inserted](const NodeExtensionHelper &Op) { if (Op.needToPromoteOtherUsers()) { @@ -15016,18 +15016,18 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, return SDValue(); SmallVector FoldingStrategies = - NodeExtensionHelper::getSupportedFoldings(N); + NodeExtensionHelper::getSupportedFoldings(Root); assert(!FoldingStrategies.empty() && "Nothing to be folded"); bool Matched = false; for (int Attempt = 0; - (Attempt != 1 + NodeExtensionHelper::isCommutative(N)) && !Matched; + (Attempt != 1 + NodeExtensionHelper::isCommutative(Root)) && !Matched; ++Attempt) { for (NodeExtensionHelper::CombineToTry FoldingStrategy : FoldingStrategies) { std::optional Res = - FoldingStrategy(N, LHS, RHS, DAG, Subtarget); + FoldingStrategy(Root, LHS, RHS, DAG, Subtarget); if (Res) { Matched = true; CombinesToApply.push_back(*Res); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index 9d63b8f31a3e8..feb0178569bc7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -882,3 +882,41 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) { 
%g = mul <2 x i64> %e, %f ret <2 x i64> %g } + +define <2 x i16> @vwmul_v2i16_multiuse(ptr %x, ptr %y, ptr %z, ptr %w) { +; CHECK-LABEL: vwmul_v2i16_multiuse: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vle8.v v9, (a2) +; CHECK-NEXT: vle8.v v10, (a3) +; CHECK-NEXT: vle8.v v11, (a0) +; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vwmul.vv v13, v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vv v9, v12, v9 +; CHECK-NEXT: vdivu.vv v8, v12, v8 +; CHECK-NEXT: vor.vv v9, v13, v9 +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: ret + %a = load <2 x i8>, ptr %x + %b = load <2 x i8>, ptr %y + %c = load <2 x i8>, ptr %z + %d = load <2 x i8>, ptr %w + + %as = sext <2 x i8> %a to <2 x i16> + %bs = sext <2 x i8> %b to <2 x i16> + %cs = sext <2 x i8> %c to <2 x i16> + %ds = sext <2 x i8> %d to <2 x i16> + + %e = mul <2 x i16> %as, %ds + %f = mul <2 x i16> %bs, %ds ; shares 1 use with %e + %g = udiv <2 x i16> %bs, %cs ; shares 1 use with %f, and no uses with %e + + %h = or <2 x i16> %e, %f + %i = or <2 x i16> %h, %g + ret <2 x i16> %i +} From 342bd4b89355c27203b5f1abd8c43de6b01aba14 Mon Sep 17 00:00:00 2001 From: Ben Langmuir Date: Thu, 18 Jul 2024 08:54:43 -0700 Subject: [PATCH 039/486] [orc] Add the name of static archives to the name of their member objects (#99407) Changes "MyObj.o" to "/path/to/libMyLib.a(MyObj.o)". This allows us to differentiate between objects that have the same basename but came from different archives. It also fixes a bug where if two such objects were both linked and both have initializer sections their initializer symbol would cause a duplicate symbol error. 
rdar://131782514 --- .../llvm/ExecutionEngine/Orc/ExecutionUtils.h | 1 + .../ExecutionEngine/Orc/ExecutionUtils.cpp | 14 +++- .../MachO_archive_two_objects_same_name.s | 64 +++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 llvm/test/ExecutionEngine/JITLink/x86-64/MachO_archive_two_objects_same_name.s diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h index ed30a792e9e9c..f997faf1ebcb0 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -328,6 +328,7 @@ class StaticLibraryDefinitionGenerator : public DefinitionGenerator { std::unique_ptr ArchiveBuffer; std::unique_ptr Archive; DenseMap ObjectFilesMap; + BumpPtrAllocator ObjFileNameStorage; }; /// A utility class to create COFF dllimport GOT symbols (__imp_*) and PLT diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 8a5986c1b88b1..c1a193f6a2802 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -17,6 +17,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/StringSaver.h" #include "llvm/Target/TargetMachine.h" #include @@ -422,6 +423,7 @@ Error StaticLibraryDefinitionGenerator::buildObjectFilesMap() { DenseMap MemoryBuffers; DenseSet Visited; DenseSet Excluded; + StringSaver FileNames(ObjFileNameStorage); for (auto &S : Archive->symbols()) { StringRef SymName = S.getName(); auto Member = S.getMember(); @@ -438,7 +440,17 @@ Error StaticLibraryDefinitionGenerator::buildObjectFilesMap() { Excluded.insert(DataOffset); continue; } - MemoryBuffers[DataOffset] = (*Child)->getMemoryBufferRef(); + + // Give members of the archive a name that contains the archive path so + // that they can be differentiated from a 
member with the same name in a + // different archive. This also ensure initializer symbols names will be + // unique within a JITDylib. + StringRef FullName = FileNames.save(Archive->getFileName() + "(" + + (*Child)->getFileName() + ")"); + MemoryBufferRef MemBuffer((*Child)->getMemoryBufferRef().getBuffer(), + FullName); + + MemoryBuffers[DataOffset] = MemBuffer; } if (!Excluded.count(DataOffset)) ObjectFilesMap[L.getExecutionSession().intern(SymName)] = diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_archive_two_objects_same_name.s b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_archive_two_objects_same_name.s new file mode 100644 index 0000000000000..f2bd64f02ca80 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_archive_two_objects_same_name.s @@ -0,0 +1,64 @@ +# Check that the generated __inits symbol name does not clash between objects +# with the same base name in two different static archives. Otherwise we get a +# duplicate symbol error. + +# RUN: rm -rf %t && mkdir -p %t +# RUN: split-file %s %t + +# RUN: llvm-mc -triple x86_64-apple-macosx10.9 -filetype=obj \ +# RUN: -o %t/dir1/myobj.o %t/dir1/myobj.s +# RUN: llvm-ar crs %t/libmyobj1.a %t/dir1/myobj.o + +# RUN: llvm-mc -triple x86_64-apple-macosx10.9 -filetype=obj \ +# RUN: -o %t/dir2/myobj.o %t/dir2/myobj.s +# RUN: llvm-ar crs %t/libmyobj2.a %t/dir2/myobj.o + +# RUN: llvm-mc -triple x86_64-apple-macosx10.9 -filetype=obj \ +# RUN: -o %t/main.o %t/main.s + +# RUN: llvm-jitlink -noexec %t/main.o -lmyobj1 -lmyobj2 -L%t + +#--- dir1/myobj.s + .section __TEXT,__text,regular,pure_instructions + .build_version macos, 15, 0 sdk_version 15, 0 + .globl _myobj1 + .p2align 4, 0x90 +_myobj1: ## @f + retq + + .section __DATA,__mod_init_func,mod_init_funcs + .p2align 3, 0x0 + .quad _myobj1 + + .subsections_via_symbols + +#--- dir2/myobj.s + .section __TEXT,__text,regular,pure_instructions + .build_version macos, 15, 0 sdk_version 15, 0 + .globl _myobj2 + .p2align 4, 0x90 +_myobj2: ## @f + 
retq + + .section __DATA,__mod_init_func,mod_init_funcs + .p2align 3, 0x0 + .quad _myobj2 + + .subsections_via_symbols + +#--- main.s + + .section __TEXT,__text,regular,pure_instructions + + .globl _main + .p2align 4, 0x90 +_main: + pushq %rbp + movq %rsp, %rbp + callq _myobj1 + callq _myobj2 + xorl %eax, %eax + popq %rbp + retq + + .subsections_via_symbols From 10627d20044cb13d3fa60a3bce31d37edb3a591f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Jul 2024 09:02:17 -0700 Subject: [PATCH 040/486] Revert "[RISCV] Use Root instead of N throughout the worklist loop in combineBinOp_VLToVWBinOp_VL. (#99416)" This reverts commit 0c4023ae3b64c54ff51947e9776aee0e963c5635. I messed up re-generating the test after the change. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 ++--- .../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll | 38 ------------------- 2 files changed, 5 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e938454b8e642..21193ebe1eb94 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14998,8 +14998,8 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, if (!NodeExtensionHelper::isSupportedRoot(Root, Subtarget)) return SDValue(); - NodeExtensionHelper LHS(Root, 0, DAG, Subtarget); - NodeExtensionHelper RHS(Root, 1, DAG, Subtarget); + NodeExtensionHelper LHS(N, 0, DAG, Subtarget); + NodeExtensionHelper RHS(N, 1, DAG, Subtarget); auto AppendUsersIfNeeded = [&Worklist, &Inserted](const NodeExtensionHelper &Op) { if (Op.needToPromoteOtherUsers()) { @@ -15016,18 +15016,18 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, return SDValue(); SmallVector FoldingStrategies = - NodeExtensionHelper::getSupportedFoldings(Root); + NodeExtensionHelper::getSupportedFoldings(N); assert(!FoldingStrategies.empty() && "Nothing to be folded"); bool Matched = false; for (int Attempt = 0; - (Attempt != 1 + 
NodeExtensionHelper::isCommutative(Root)) && !Matched; + (Attempt != 1 + NodeExtensionHelper::isCommutative(N)) && !Matched; ++Attempt) { for (NodeExtensionHelper::CombineToTry FoldingStrategy : FoldingStrategies) { std::optional Res = - FoldingStrategy(Root, LHS, RHS, DAG, Subtarget); + FoldingStrategy(N, LHS, RHS, DAG, Subtarget); if (Res) { Matched = true; CombinesToApply.push_back(*Res); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index feb0178569bc7..9d63b8f31a3e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -882,41 +882,3 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) { %g = mul <2 x i64> %e, %f ret <2 x i64> %g } - -define <2 x i16> @vwmul_v2i16_multiuse(ptr %x, ptr %y, ptr %z, ptr %w) { -; CHECK-LABEL: vwmul_v2i16_multiuse: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: vle8.v v9, (a2) -; CHECK-NEXT: vle8.v v10, (a3) -; CHECK-NEXT: vle8.v v11, (a0) -; CHECK-NEXT: vsext.vf2 v12, v8 -; CHECK-NEXT: vsext.vf2 v8, v9 -; CHECK-NEXT: vsext.vf2 v9, v10 -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vwmul.vv v13, v11, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vmul.vv v9, v12, v9 -; CHECK-NEXT: vdivu.vv v8, v12, v8 -; CHECK-NEXT: vor.vv v9, v13, v9 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: ret - %a = load <2 x i8>, ptr %x - %b = load <2 x i8>, ptr %y - %c = load <2 x i8>, ptr %z - %d = load <2 x i8>, ptr %w - - %as = sext <2 x i8> %a to <2 x i16> - %bs = sext <2 x i8> %b to <2 x i16> - %cs = sext <2 x i8> %c to <2 x i16> - %ds = sext <2 x i8> %d to <2 x i16> - - %e = mul <2 x i16> %as, %ds - %f = mul <2 x i16> %bs, %ds ; shares 1 use with %e - %g = udiv <2 x i16> %bs, %cs ; shares 1 use with %f, and no uses with %e - - %h = or <2 x i16> %e, %f - %i = or <2 x i16> %h, %g - ret <2 x i16> %i 
-} From 0ce11a1a763d46e4afe678f3f94a1932c1dcfe5d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 18 Jul 2024 18:04:19 +0200 Subject: [PATCH 041/486] [libc++] Add a release note about C++03 being frozen after LLVM 21 (#95894) Co-authored-by: Louis Dionne --- libcxx/docs/ReleaseNotes/19.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 36cb23dfde6c9..624550f998858 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -169,8 +169,13 @@ LLVM 20 LLVM 21 ~~~~~~~ -TODO +- The status of the C++03 implementation will be frozen after the LLVM 21 release. This means that starting in LLVM 22, non-critical bug fixes may not be back-ported + to C++03, including LWG issues. C++03 is a legacy platform, where most projects are no longer actively maintained. To + reduce the amount of fixes required to keep such legacy projects compiling with up-to-date toolchains, libc++ will aim to freeze the status of the headers in C++03 mode to avoid unintended breaking changes. + See https://discourse.llvm.org/t/rfc-freezing-c-03-headers-in-libc for more details. + + If you are using C++03 in your project, you should consider moving to a newer version of the Standard to get the most out of libc++. ABI Affecting Changes --------------------- From 2bf91db0c743f041c9f83609399f75654c07445a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 18 Jul 2024 18:05:06 +0200 Subject: [PATCH 042/486] [libc++] Use char_traits::copy while inserting when possible (#97201) This reduces the number of asm lines from 707 to 519 for this snippet: ```c++ auto test(std::string& str, const char* begin, const char* end) { str.insert(str.begin(), begin, end); } ``` While that's not a performance metric, I've never seen a use of `memcpy` result in a performance regression for any realistic usage. 
--- libcxx/include/string | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 90394e9edbe83..54e9d8990c220 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1870,6 +1870,23 @@ private: template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iterator __first, _Sentinel __last); + // Copy [__first, __last) into [__dest, __dest + (__last - __first)). Assumes that the ranges don't overlap. + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static value_type* + __copy_non_overlapping_range(_ForwardIter __first, _Sent __last, value_type* __dest) { +#ifndef _LIBCPP_CXX03_LANG + if constexpr (__libcpp_is_contiguous_iterator<_ForwardIter>::value && + is_same>::value && is_same<_ForwardIter, _Sent>::value) { + traits_type::copy(__dest, std::__to_address(__first), __last - __first); + return __dest + (__last - __first); + } +#endif + + for (; __first != __last; ++__first) + traits_type::assign(*__dest++, *__first); + return __dest; + } + template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 iterator __insert_from_safe_copy(size_type __n, size_type __ip, _ForwardIterator __first, _Sentinel __last) { @@ -1889,8 +1906,7 @@ private: __sz += __n; __set_size(__sz); traits_type::assign(__p[__sz], value_type()); - for (__p += __ip; __first != __last; ++__p, ++__first) - traits_type::assign(*__p, *__first); + __copy_non_overlapping_range(__first, __last, __p + __ip); return begin() + __ip; } @@ -2405,9 +2421,8 @@ basic_string<_CharT, _Traits, _Allocator>::__init_with_size(_InputIterator __fir #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS - for (; __first != __last; ++__first, (void)++__p) - traits_type::assign(*__p, *__first); - traits_type::assign(*__p, value_type()); + auto __end = __copy_non_overlapping_range(__first, __last, std::__to_address(__p)); + traits_type::assign(*__end, 
value_type()); #ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (...) { if (__is_long()) @@ -2873,10 +2888,8 @@ basic_string<_CharT, _Traits, _Allocator>::append(_ForwardIterator __first, _For if (__cap - __sz < __n) __grow_by_without_replace(__cap, __sz + __n - __cap, __sz, __sz, 0); __annotate_increase(__n); - pointer __p = __get_pointer() + __sz; - for (; __first != __last; ++__p, (void)++__first) - traits_type::assign(*__p, *__first); - traits_type::assign(*__p, value_type()); + auto __end = __copy_non_overlapping_range(__first, __last, std::__to_address(__get_pointer() + __sz)); + traits_type::assign(*__end, value_type()); __set_size(__sz + __n); } else { const basic_string __temp(__first, __last, __alloc()); From 77ac07444d32668d5826ef27c24180fb10425213 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Jul 2024 09:10:52 -0700 Subject: [PATCH 043/486] Re-commit "[RISCV] Use Root instead of N throughout the worklist loop in combineBinOp_VLToVWBinOp_VL. (#99416)" With correct test update. Original message: We were only checking that the node from the worklist is a supported root. We weren't checking the strategy or any of its operands unless it was the original node. For any other node, we just rechecked the original node's strategy and operands. The effect of this is that we don't do all of the transformations at once. Instead, when there were multiple possible nodes to transform we would only do them as each node was visited by the main DAG combine worklist. The test shows a case where we widened an instruction without removing all of the uses of the vsext. The sext is shared by one node that shares another sext node with the root another node that doesn't share anything with the root. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 ++--- .../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll | 37 +++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 21193ebe1eb94..e938454b8e642 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14998,8 +14998,8 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, if (!NodeExtensionHelper::isSupportedRoot(Root, Subtarget)) return SDValue(); - NodeExtensionHelper LHS(N, 0, DAG, Subtarget); - NodeExtensionHelper RHS(N, 1, DAG, Subtarget); + NodeExtensionHelper LHS(Root, 0, DAG, Subtarget); + NodeExtensionHelper RHS(Root, 1, DAG, Subtarget); auto AppendUsersIfNeeded = [&Worklist, &Inserted](const NodeExtensionHelper &Op) { if (Op.needToPromoteOtherUsers()) { @@ -15016,18 +15016,18 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, return SDValue(); SmallVector FoldingStrategies = - NodeExtensionHelper::getSupportedFoldings(N); + NodeExtensionHelper::getSupportedFoldings(Root); assert(!FoldingStrategies.empty() && "Nothing to be folded"); bool Matched = false; for (int Attempt = 0; - (Attempt != 1 + NodeExtensionHelper::isCommutative(N)) && !Matched; + (Attempt != 1 + NodeExtensionHelper::isCommutative(Root)) && !Matched; ++Attempt) { for (NodeExtensionHelper::CombineToTry FoldingStrategy : FoldingStrategies) { std::optional Res = - FoldingStrategy(N, LHS, RHS, DAG, Subtarget); + FoldingStrategy(Root, LHS, RHS, DAG, Subtarget); if (Res) { Matched = true; CombinesToApply.push_back(*Res); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index 9d63b8f31a3e8..97c7f101c2582 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -882,3 +882,40 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) { 
%g = mul <2 x i64> %e, %f ret <2 x i64> %g } + +define <2 x i16> @vwmul_v2i16_multiuse(ptr %x, ptr %y, ptr %z, ptr %w) { +; CHECK-LABEL: vwmul_v2i16_multiuse: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vle8.v v10, (a2) +; CHECK-NEXT: vle8.v v11, (a3) +; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vsext.vf2 v10, v11 +; CHECK-NEXT: vmul.vv v11, v12, v10 +; CHECK-NEXT: vmul.vv v10, v8, v10 +; CHECK-NEXT: vdivu.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v9, v11, v10 +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: ret + %a = load <2 x i8>, ptr %x + %b = load <2 x i8>, ptr %y + %c = load <2 x i8>, ptr %z + %d = load <2 x i8>, ptr %w + + %as = sext <2 x i8> %a to <2 x i16> + %bs = sext <2 x i8> %b to <2 x i16> + %cs = sext <2 x i8> %c to <2 x i16> + %ds = sext <2 x i8> %d to <2 x i16> + + %e = mul <2 x i16> %as, %ds + %f = mul <2 x i16> %bs, %ds ; shares 1 use with %e + %g = udiv <2 x i16> %bs, %cs ; shares 1 use with %f, and no uses with %e + + %h = or <2 x i16> %e, %f + %i = or <2 x i16> %h, %g + ret <2 x i16> %i +} From c0c157a51832a2c7bbd09a449e33cc94d7747abf Mon Sep 17 00:00:00 2001 From: Sayhaan Siddiqui <49014204+sayhaan@users.noreply.github.com> Date: Thu, 18 Jul 2024 09:24:46 -0700 Subject: [PATCH 044/486] [BOLT][DWARF][NFC] Remove DWO ranges base (#99284) Removes getters and setters for DWO ranges base due to it not being used. 
--- bolt/include/bolt/Rewrite/DWARFRewriter.h | 9 --------- bolt/lib/Rewrite/DWARFRewriter.cpp | 1 - 2 files changed, 10 deletions(-) diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h index abd18b56113b6..b798c5b76fc28 100644 --- a/bolt/include/bolt/Rewrite/DWARFRewriter.h +++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h @@ -95,9 +95,6 @@ class DWARFRewriter { std::mutex LocListDebugInfoPatchesMutex; - /// Dwo id specific its RangesBase. - std::unordered_map DwoRangesBase; - std::unordered_map LineTablePatchMap; std::unordered_map TypeUnitRelocMap; @@ -191,12 +188,6 @@ class DWARFRewriter { /// Update stmt_list for CUs based on the new .debug_line \p Layout. void updateLineTableOffsets(const MCAssembler &Asm); - uint64_t getDwoRangesBase(uint64_t DWOId) { return DwoRangesBase[DWOId]; } - - void setDwoRangesBase(uint64_t DWOId, uint64_t RangesBase) { - DwoRangesBase[DWOId] = RangesBase; - } - using OverriddenSectionsMap = std::unordered_map; /// Output .dwo files. void writeDWOFiles(DWARFUnit &, const OverriddenSectionsMap &, diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 042c39a574561..4ba6344925856 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -719,7 +719,6 @@ void DWARFRewriter::updateDebugInfo() { } else { TempRangesSectionWriter = LegacyRangesWritersByCU[*DWOId].get(); RangesBase = RangesSectionWriter->getSectionOffset(); - setDwoRangesBase(*DWOId, *RangesBase); } updateUnitDebugInfo(*(*SplitCU), DWODIEBuilder, DebugLocDWoWriter, From 8c8e0ddae96882247717b8ae1739abcf09726eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 18 Jul 2024 18:23:45 +0200 Subject: [PATCH 045/486] [clang][Interp][test] Use fixed triple in cxx11 test This uses 'long', which has a different size on Windows. The test I copied this from also uses x86_64-linux. 
This should fix the bot: https://lab.llvm.org/buildbot/#/builders/81/builds/853 --- clang/test/AST/Interp/cxx11.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp index c0b88f0e567e0..92ab9b605f30d 100644 --- a/clang/test/AST/Interp/cxx11.cpp +++ b/clang/test/AST/Interp/cxx11.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=both,expected -std=c++11 %s -// RUN: %clang_cc1 -verify=both,ref -std=c++11 %s +// RUN: %clang_cc1 -triple x86_64-linux -fexperimental-new-constant-interpreter -verify=both,expected -std=c++11 %s +// RUN: %clang_cc1 -triple x86_64-linux -verify=both,ref -std=c++11 %s namespace IntOrEnum { const int k = 0; From fe04aafe6c27f32ad4ba38e552d06d14431cb2de Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Thu, 18 Jul 2024 22:06:44 +0530 Subject: [PATCH 046/486] [MLIR][Affine] NFC. Expose affine loop tiling validity utility (#99459) Move the utility to check for the validity of tiling affine loop nests to affine loop utils and expose for users outside the loop tiling pass or downstream users. --- .../Dialect/Affine/Analysis/LoopAnalysis.h | 10 +++ .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 58 ++++++++++++++++ .../Dialect/Affine/Transforms/LoopTiling.cpp | 67 +------------------ .../Dialect/Affine/loop-tiling-validity.mlir | 2 +- 4 files changed, 71 insertions(+), 66 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h index 7b92b930fb5f5..ed3c21d952a01 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h @@ -109,6 +109,16 @@ bool isVectorizableLoopBody(AffineForOp loop, int *memRefDim, // the support. 
bool isOpwiseShiftValid(AffineForOp forOp, ArrayRef shifts); +/// Checks whether hyper-rectangular loop tiling of the nest represented by +/// `loops` is valid. The validity condition is from Irigoin and Triolet, +/// which states that two tiles cannot depend on each other. We simplify such +/// condition to just checking whether there is any negative dependence +/// direction, since we have the prior knowledge that the tiling results will be +/// hyper-rectangles, which are scheduled in the lexicographically increasing +/// order on the vector of loop indices. This function will return failure when +/// any dependence component is negative along any of `loops`. +bool isTilingValid(ArrayRef loops); + } // namespace affine } // namespace mlir diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index 82ba8fc5ccbc1..411b5efb36cab 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Support/Debug.h" #include #include #include @@ -30,6 +31,8 @@ using namespace mlir; using namespace mlir::affine; +#define DEBUG_TYPE "affine-loop-analysis" + /// Returns the trip count of the loop as an affine expression if the latter is /// expressible as an affine expression, and nullptr otherwise. The trip count /// expression is simplified before returning. This method only utilizes map @@ -390,3 +393,58 @@ bool mlir::affine::isOpwiseShiftValid(AffineForOp forOp, } return true; } + +bool mlir::affine::isTilingValid(ArrayRef loops) { + assert(!loops.empty() && "no original loops provided"); + + // We first find out all dependences we intend to check. 
+ SmallVector loadAndStoreOps; + loops[0]->walk([&](Operation *op) { + if (isa(op)) + loadAndStoreOps.push_back(op); + }); + + unsigned numOps = loadAndStoreOps.size(); + unsigned numLoops = loops.size(); + for (unsigned d = 1; d <= numLoops + 1; ++d) { + for (unsigned i = 0; i < numOps; ++i) { + Operation *srcOp = loadAndStoreOps[i]; + MemRefAccess srcAccess(srcOp); + for (unsigned j = 0; j < numOps; ++j) { + Operation *dstOp = loadAndStoreOps[j]; + MemRefAccess dstAccess(dstOp); + + SmallVector depComps; + DependenceResult result = checkMemrefAccessDependence( + srcAccess, dstAccess, d, /*dependenceConstraints=*/nullptr, + &depComps); + + // Skip if there is no dependence in this case. + if (!hasDependence(result)) + continue; + + // Check whether there is any negative direction vector in the + // dependence components found above, which means that dependence is + // violated by the default hyper-rect tiling method. + LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " + "for dependence at depth: " + << Twine(d) << " between:\n";); + LLVM_DEBUG(srcAccess.opInst->dump()); + LLVM_DEBUG(dstAccess.opInst->dump()); + for (const DependenceComponent &depComp : depComps) { + if (depComp.lb.has_value() && depComp.ub.has_value() && + *depComp.lb < *depComp.ub && *depComp.ub < 0) { + LLVM_DEBUG(llvm::dbgs() + << "Dependence component lb = " << Twine(*depComp.lb) + << " ub = " << Twine(*depComp.ub) + << " is negative at depth: " << Twine(d) + << " and thus violates the legality rule.\n"); + return false; + } + } + } + } + } + + return true; +} diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp index 2650a06d198ea..c8400dfe8cd5c 100644 --- a/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -93,69 +93,6 @@ static void adjustToDivisorsOfTripCounts(ArrayRef band, } } -/// Checks whether hyper-rectangular loop tiling of the nest 
represented by -/// `origLoops` is valid. The validity condition is from Irigoin and Triolet, -/// which states that two tiles cannot depend on each other. We simplify such -/// condition to just checking whether there is any negative dependence -/// direction, since we have the prior knowledge that the tiling results will be -/// hyper-rectangles, which are scheduled in the lexicographically increasing -/// order on the vector of loop indices. This function will return failure when -/// any dependence component is negative along any of `origLoops`. -static bool checkTilingLegality(MutableArrayRef origLoops) { - assert(!origLoops.empty() && "no original loops provided"); - - // We first find out all dependences we intend to check. - SmallVector loadAndStoreOps; - origLoops[0]->walk([&](Operation *op) { - if (isa(op)) - loadAndStoreOps.push_back(op); - }); - - unsigned numOps = loadAndStoreOps.size(); - unsigned numLoops = origLoops.size(); - for (unsigned d = 1; d <= numLoops + 1; ++d) { - for (unsigned i = 0; i < numOps; ++i) { - Operation *srcOp = loadAndStoreOps[i]; - MemRefAccess srcAccess(srcOp); - for (unsigned j = 0; j < numOps; ++j) { - Operation *dstOp = loadAndStoreOps[j]; - MemRefAccess dstAccess(dstOp); - - SmallVector depComps; - DependenceResult result = checkMemrefAccessDependence( - srcAccess, dstAccess, d, /*dependenceConstraints=*/nullptr, - &depComps); - - // Skip if there is no dependence in this case. - if (!hasDependence(result)) - continue; - - // Check whether there is any negative direction vector in the - // dependence components found above, which means that dependence is - // violated by the default hyper-rect tiling method. 
- LLVM_DEBUG(llvm::dbgs() << "Checking whether tiling legality violated " - "for dependence at depth: " - << Twine(d) << " between:\n";); - LLVM_DEBUG(srcAccess.opInst->dump();); - LLVM_DEBUG(dstAccess.opInst->dump();); - for (const DependenceComponent &depComp : depComps) { - if (depComp.lb.has_value() && depComp.ub.has_value() && - *depComp.lb < *depComp.ub && *depComp.ub < 0) { - LLVM_DEBUG(llvm::dbgs() - << "Dependence component lb = " << Twine(*depComp.lb) - << " ub = " << Twine(*depComp.ub) - << " is negative at depth: " << Twine(d) - << " and thus violates the legality rule.\n"); - return false; - } - } - } - } - } - - return true; -} - // Returns tile sizes to use. Checks CL options; if none are specified, sets it // based on a simple model that looks at the memory footprint and determines // tile sizes assuming identity accesses / 1:1 tile size proportional footprint @@ -242,8 +179,8 @@ void LoopTiling::runOnOperation() { // Tile each band. for (auto &band : bands) { - if (!checkTilingLegality(band)) { - band.front().emitRemark("tiling code is illegal due to dependences"); + if (!isTilingValid(band)) { + band.front().emitRemark("tiling nest is invalid due to dependences"); continue; } diff --git a/mlir/test/Dialect/Affine/loop-tiling-validity.mlir b/mlir/test/Dialect/Affine/loop-tiling-validity.mlir index d1b80520ca7fe..e2c3832f695cc 100644 --- a/mlir/test/Dialect/Affine/loop-tiling-validity.mlir +++ b/mlir/test/Dialect/Affine/loop-tiling-validity.mlir @@ -34,7 +34,7 @@ func.func @illegal_loop_with_diag_dependence() { %A = memref.alloc() : memref<64x64xf32> affine.for %i = 0 to 64 { - // expected-remark@above {{tiling code is illegal due to dependences}} + // expected-remark@above {{tiling nest is invalid due to dependences}} affine.for %j = 0 to 64 { %0 = affine.load %A[%j, %i] : memref<64x64xf32> %1 = affine.load %A[%i, %j - 1] : memref<64x64xf32> From 06ab30b57450694818dbb649dec2a687f44df7f4 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 18 
Jul 2024 09:40:37 -0700 Subject: [PATCH 047/486] [AMDGPU] Constant folding of llvm.amdgcn.trig.preop (#98562) If the parameters(the input and segment select) coming in to amdgcn.trig.preop intrinsic are compile time constants, we pre-compute the output of amdgcn.trig.preop on the CPU and replaces the uses with the computed constant. This work extends the patch https://reviews.llvm.org/D120150 to make it a complete coverage. For the segment select, only src1[4:0] are used. A segment select is invalid if we are selecting the 53-bit segment beyond the [1200:0] range of the 2/PI table. 0 is returned when a segment select is not valid. --- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 72 +++++ .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 291 ++++++++++++++---- 2 files changed, 302 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 93bca4402ed23..9197404309663 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -1102,6 +1102,78 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::amdgcn_trig_preop: { + // The intrinsic is declared with name mangling, but currently the + // instruction only exists for f64 + if (!II.getType()->isDoubleTy()) + break; + + Value *Src = II.getArgOperand(0); + Value *Segment = II.getArgOperand(1); + if (isa(Src) || isa(Segment)) + return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); + + if (isa(Src)) { + auto *QNaN = ConstantFP::get( + II.getType(), APFloat::getQNaN(II.getType()->getFltSemantics())); + return IC.replaceInstUsesWith(II, QNaN); + } + + const ConstantFP *Csrc = dyn_cast(Src); + if (!Csrc) + break; + + if (II.isStrictFP()) + break; + + const APFloat &Fsrc = Csrc->getValueAPF(); + if (Fsrc.isNaN()) { + auto *Quieted = ConstantFP::get(II.getType(), Fsrc.makeQuiet()); + return 
IC.replaceInstUsesWith(II, Quieted); + } + + const ConstantInt *Cseg = dyn_cast(Segment); + if (!Cseg) + break; + + unsigned Exponent = (Fsrc.bitcastToAPInt().getZExtValue() >> 52) & 0x7ff; + unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue(); + unsigned Shift = SegmentVal * 53; + if (Exponent > 1077) + Shift += Exponent - 1077; + + // 2.0/PI table. + static const uint32_t TwoByPi[] = { + 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041, + 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, + 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41, + 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f, + 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d, + 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, + 0x56033046}; + + // Return 0 for outbound segment (hardware behavior). + unsigned Idx = Shift >> 5; + if (Idx + 2 >= std::size(TwoByPi)) { + APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics()); + return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero)); + } + + unsigned BShift = Shift & 0x1f; + uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]); + uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0); + if (BShift) + Thi = (Thi << BShift) | (Tlo >> (64 - BShift)); + Thi = Thi >> 11; + APFloat Result = APFloat((double)Thi); + + int Scale = -53 - Shift; + if (Exponent >= 1968) + Scale += 128; + + Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven); + return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result)); + } case Intrinsic::amdgcn_fmul_legacy: { Value *Op0 = II.getArgOperand(0); Value *Op1 = II.getArgOperand(1); diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 59118a172a2bc..9cb79b2644865 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ 
b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -5608,8 +5608,7 @@ declare float @llvm.amdgcn.trig.preop.f32(float, i32) define double @trig_preop_constfold_variable_undef_arg(i32 %arg) { ; CHECK-LABEL: @trig_preop_constfold_variable_undef_arg( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 [[ARG:%.*]]) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %val = call double @llvm.amdgcn.trig.preop.f64(double undef, i32 %arg) ret double %val @@ -5617,8 +5616,7 @@ define double @trig_preop_constfold_variable_undef_arg(i32 %arg) { define double @trig_preop_constfold_variable_poison_arg(i32 %arg) { ; CHECK-LABEL: @trig_preop_constfold_variable_poison_arg( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 [[ARG:%.*]]) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double poison ; %val = call double @llvm.amdgcn.trig.preop.f64(double poison, i32 %arg) ret double %val @@ -5635,8 +5633,7 @@ define double @trig_preop_constfold_variable_arg_undef(double %arg) { define double @trig_preop_constfold_variable_arg_poison(double %arg) { ; CHECK-LABEL: @trig_preop_constfold_variable_arg_poison( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double [[ARG:%.*]], i32 poison) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double poison ; %val = call double @llvm.amdgcn.trig.preop.f64(double %arg, i32 poison) ret double %val @@ -5653,8 +5650,7 @@ define double @trig_preop_constfold_variable_int(i32 %arg) { define double @trig_preop_qnan(i32 %arg) { ; CHECK-LABEL: @trig_preop_qnan( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 [[ARG:%.*]]) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double 0x7FF8000000000000 ; %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF8000000000000, i32 %arg) ret double %val @@ -5662,8 +5658,7 @@ define double @trig_preop_qnan(i32 %arg) 
{ define double @trig_preop_snan(i32 %arg) { ; CHECK-LABEL: @trig_preop_snan( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 [[ARG:%.*]]) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double 0x7FF8000000000001 ; %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000001, i32 %arg) ret double %val @@ -5671,8 +5666,7 @@ define double @trig_preop_snan(i32 %arg) { define double @trig_preop_inf_0() { ; CHECK-LABEL: @trig_preop_inf_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double 0xB43DD63F5F2F8BD ; %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7FF0000000000000, i32 0) ret double %val @@ -5680,8 +5674,7 @@ define double @trig_preop_inf_0() { define double @trig_preop_ninf_0() { ; CHECK-LABEL: @trig_preop_ninf_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double 0xB43DD63F5F2F8BD ; %val = call double @llvm.amdgcn.trig.preop.f64(double 0xFFF0000000000000, i32 0) ret double %val @@ -5707,10 +5700,36 @@ define double @trig_preop_variable_args(double %arg0, i32 %arg1) { define double @trig_preop_constfold() { ; CHECK-LABEL: @trig_preop_constfold( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) -; CHECK-NEXT: ret double [[VAL]] +; CHECK-NEXT: ret double 0x394A6EE06DB14ACC +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 2) + ret double %val +} + +; src1[4:0] <= 21 for segment to be inbound with this exponent of src0. 
+define double @trig_preop_constfold_outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) + %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 22) + ret double %val +} + +; Only use src1[4:0], so segment is actually 31 for -1. +define double @trig_preop_constfold_neg1_segment() { +; CHECK-LABEL: @trig_preop_constfold_neg1_segment( +; CHECK-NEXT: ret double 0.000000e+00 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 -1) + ret double %val +} + +; Only use src1[4:0], so segment is actually 0 for -32. +define double @trig_preop_constfold_neg32_segment() { +; CHECK-LABEL: @trig_preop_constfold_neg32_segment( +; CHECK-NEXT: ret double 0x3FE45F306DC9C882 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 -32) ret double %val } @@ -5723,84 +5742,234 @@ define double @trig_preop_constfold_strictfp() strictfp { ret double %val } -define double @trig_preop_constfold_0.0__0() { -; CHECK-LABEL: @trig_preop_constfold_0.0__0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 0) -; CHECK-NEXT: ret double [[VAL]] +define double @trig_preop_constfold_exponent0_mantissa0__segment0() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__segment0( +; CHECK-NEXT: ret double 0x3FE45F306DC9C882 ; %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 0) ret double %val } -define double @trig_preop_constfold_0.0__1() { -; CHECK-LABEL: @trig_preop_constfold_0.0__1( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 1) -; CHECK-NEXT: ret double [[VAL]] +define double @trig_preop_constfold_exponent0_mantissa1__segment0() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__segment0( +; CHECK-NEXT: ret double 0x3FE45F306DC9C882 ; - %val = call double 
@llvm.amdgcn.trig.preop.f64(double 0.0, i32 1) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000FFFFFFFFFFFFF, i32 0) ret double %val } -define double @trig_preop_constfold_0.0__neg1() { -; CHECK-LABEL: @trig_preop_constfold_0.0__neg1( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -1) -; CHECK-NEXT: ret double [[VAL]] +define double @trig_preop_constfold_exponent0_mantissaX__segment0() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__segment0( +; CHECK-NEXT: ret double 0x3FE45F306DC9C882 ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 -1) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x0004A7F09D5F47D4, i32 0) ret double %val } -define double @trig_preop_constfold_0.0__9999999() { -; CHECK-LABEL: @trig_preop_constfold_0.0__9999999( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 9999999) -; CHECK-NEXT: ret double [[VAL]] +define double @trig_preop_constfold_exponent0_mantissa0__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__segment2( +; CHECK-NEXT: ret double 0x394A6EE06DB14ACC ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 9999999) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 2) ret double %val } -define double @trig_preop_constfold_0.0__neg999999() { -; CHECK-LABEL: @trig_preop_constfold_0.0__neg999999( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0.000000e+00, i32 -999999) -; CHECK-NEXT: ret double [[VAL]] +define double @trig_preop_constfold_exponent0_mantissa1__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__segment2( +; CHECK-NEXT: ret double 0x394A6EE06DB14ACC ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 -999999) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000FFFFFFFFFFFFF, i32 2) ret double %val } -define double @trig_preop_constfold_0x0020000000000000_0() { 
-; CHECK-LABEL: @trig_preop_constfold_0x0020000000000000_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x10000000000000, i32 0) -; CHECK-NEXT: ret double [[VAL]] +define double @trig_preop_constfold_exponent0_mantissaX__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__segment2( +; CHECK-NEXT: ret double 0x394A6EE06DB14ACC ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 0x0010000000000000, i32 0) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x00094A6EE06DB14A, i32 2) ret double %val } -define double @trig_preop_constfold_0x001fffffffffffff_0() { -; CHECK-LABEL: @trig_preop_constfold_0x001fffffffffffff_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0xFFFFFFFFFFFFF, i32 0) -; CHECK-NEXT: ret double [[VAL]] +; src1[4:0] <= 21 for segment to be inbound with this exponent of src0. +define double @trig_preop_constfold_exponent0_mantissa0__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa0__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000fffffffffffff, i32 0) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0.0, i32 22) ret double %val } -define double @trig_preop_constfold_0x8020000000000000_0() { -; CHECK-LABEL: @trig_preop_constfold_0x8020000000000000_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0) -; CHECK-NEXT: ret double [[VAL]] +; src1[4:0] <= 21 for segment to be inbound with this exponent of src0. 
+define double @trig_preop_constfold_exponent0_mantissa1__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissa1__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 0x8020000000000000, i32 0) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000FFFFFFFFFFFFF, i32 22) ret double %val } -define double @trig_preop_constfold_0x801fffffffffffff_0() { -; CHECK-LABEL: @trig_preop_constfold_0x801fffffffffffff_0( -; CHECK-NEXT: [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 0x801FFFFFFFFFFFFF, i32 0) -; CHECK-NEXT: ret double [[VAL]] +; src1[4:0] <= 21 for segment to be inbound with this exponent of src0. +define double @trig_preop_constfold_exponent0_mantissaX__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent0_mantissaX__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x000A6EE06DB14ACC, i32 22) + ret double %val +} + +; 1607 = 1077 + 10 * 53 +define double @trig_preop_constfold_exponent1607_mantissa0__segment0() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__segment0( +; CHECK-NEXT: ret double 0x1EC8135A2FBF209C +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 0) + ret double %val +} + +; 1607 = 1077 + 10 * 53 +define double @trig_preop_constfold_exponent1607_mantissa1__segment1() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__segment1( +; CHECK-NEXT: ret double 0x1EC8135A2FBF209C +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 0) + ret double %val +} + +; 1607 = 1077 + 10 * 53 +define double @trig_preop_constfold_exponent1607_mantissaX__segment1() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__segment1( +; CHECK-NEXT: ret double 0x1EC8135A2FBF209C +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6471B791D6398353, i32 0) + ret double %val +} + +; 
1607 = 1077 + 10 * 53 +define double @trig_preop_constfold_exponent1607_mantissa0__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__segment2( +; CHECK-NEXT: ret double 0x181272117E2EF7E4 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 2) + ret double %val +} + +; 1607 = 1077 + 10 * 53 +define double @trig_preop_constfold_exponent1607_mantissa1__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__segment2( +; CHECK-NEXT: ret double 0x181272117E2EF7E4 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 2) + ret double %val +} + +; 1607 = 1077 + 10 * 53 +define double @trig_preop_constfold_exponent1607_mantissaX__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__segment2( +; CHECK-NEXT: ret double 0x181272117E2EF7E4 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647272117E2EF7E4, i32 2) + ret double %val +} + +; src1[4:0] <= 11 for segment to be inbound with this exponent of src0. +define double @trig_preop_constfold_exponent1607_mantissa0__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa0__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x6470000000000000, i32 12) + ret double %val +} + +; src1[4:0] <= 11 for segment to be inbound with this exponent of src0. +define double @trig_preop_constfold_exponent1607_mantissa1__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissa1__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647FFFFFFFFFFFFF, i32 12) + ret double %val +} + +; src1[4:0] <= 11 for segment to be inbound with this exponent of src0. 
+define double @trig_preop_constfold_exponent1607_mantissaX__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent1607_mantissaX__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x647181272117E2EF, i32 12) + ret double %val +} + +define double @trig_preop_constfold_exponent1968_mantissa0__segment0() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__segment0( +; CHECK-NEXT: ret double 0x10374F463F669E5F +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 0) + ret double %val +} + +define double @trig_preop_constfold_exponent1968_mantissa1__segment0() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__segment0( +; CHECK-NEXT: ret double 0x10374F463F669E5F +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 0) + ret double %val +} + +define double @trig_preop_constfold_exponent1968_mantissax__segment0() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissax__segment0( +; CHECK-NEXT: ret double 0x10374F463F669E5F +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B074F463F669E5F, i32 0) + ret double %val +} + +define double @trig_preop_constfold_exponent1968_mantissa0__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__segment2( +; CHECK-NEXT: ret double 0x98F2F8BD9E839CE +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 2) + ret double %val +} + +define double @trig_preop_constfold_exponent1968_mantissa1__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__segment2( +; CHECK-NEXT: ret double 0x98F2F8BD9E839CE +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 2) + ret double %val +} + +define double @trig_preop_constfold_exponent1968_mantissaX__segment2() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissaX__segment2( +; CHECK-NEXT: ret double 0x98F2F8BD9E839CE 
+; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0A2F8BD9E839CE, i32 2) + ret double %val +} + +; src1[4:0] <= 4 for segment to be inbound with this exponent of src0. +define double @trig_preop_constfold_exponent1968_mantissa0__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa0__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B00000000000000, i32 5) + ret double %val +} + +; src1[4:0] <= 4 for segment to be inbound with this exponent of src0. +define double @trig_preop_constfold_exponent1968_mantissa1__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissa1__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 +; + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0FFFFFFFFFFFFF, i32 5) + ret double %val +} + +; src1[4:0] <= 4 for segment to be inbound with this exponent of src0. +define double @trig_preop_constfold_exponent1968_mantissaX__outbound_segment() { +; CHECK-LABEL: @trig_preop_constfold_exponent1968_mantissaX__outbound_segment( +; CHECK-NEXT: ret double 0.000000e+00 ; - %val = call double @llvm.amdgcn.trig.preop.f64(double 0x801fffffffffffff, i32 0) + %val = call double @llvm.amdgcn.trig.preop.f64(double 0x7B0A98F2F8BD9E83, i32 5) ret double %val } From 371777695fe1b5407753ef2232d1b73014d3e501 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 18 Jul 2024 17:43:08 +0100 Subject: [PATCH 048/486] [LV] Assert uniform recipes don't get predicated for when vectorizing. Add assertion ensuring invariant on construction, split off as suggested from https://github.com/llvm/llvm-project/pull/98892. 
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7ca798a8b2d89..748db418fee8c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8578,6 +8578,12 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I, BlockInMask = getBlockInMask(I->getParent()); } + // Note that there is some custom logic to mark some intrinsics as uniform + // manually above for scalable vectors, which this assert needs to account for + // as well. + assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || + (Range.Start.isScalable() && isa(I))) && + "Should not predicate a uniform recipe"); auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(I->operands()), IsUniform, BlockInMask); return Recipe; From d06b55e7934635049d55efff2dc9e745f911240c Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Thu, 18 Jul 2024 18:46:24 +0200 Subject: [PATCH 049/486] [Flang][Runtime] Improve runtime implementation of the RENAME intrinsic (#99445) The RENAME implementation in the Fortran runtime had a few glitches that had to be addressed: - Wrong usage of RTDECL (fixed) - Issue fatal error when trying to use RENAME on a target device (fixed) --- flang/runtime/misc-intrinsic.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/flang/runtime/misc-intrinsic.cpp b/flang/runtime/misc-intrinsic.cpp index 2f7fcd2e2341f..f7d893829fc0d 100644 --- a/flang/runtime/misc-intrinsic.cpp +++ b/flang/runtime/misc-intrinsic.cpp @@ -56,10 +56,10 @@ static RT_API_ATTRS void TransferImpl(Descriptor &result, extern "C" { RT_EXT_API_GROUP_BEGIN -void RTDECL(Rename)(const Descriptor &path1, const Descriptor &path2, +void RTDEF(Rename)(const Descriptor &path1, const Descriptor &path2, const Descriptor *status, const char *sourceFile, int line) { Terminator 
terminator{sourceFile, line}; - +#if !defined(RT_DEVICE_COMPILATION) char *pathSrc{EnsureNullTerminated( path1.OffsetElement(), path1.ElementBytes(), terminator)}; char *pathDst{EnsureNullTerminated( @@ -84,6 +84,9 @@ void RTDECL(Rename)(const Descriptor &path1, const Descriptor &path2, if (pathDst != path2.OffsetElement()) { FreeMemory(pathDst); } +#else // !defined(RT_DEVICE_COMPILATION) + terminator.Crash("RENAME intrinsic is only supported on host devices"); +#endif // !defined(RT_DEVICE_COMPILATION) } void RTDEF(Transfer)(Descriptor &result, const Descriptor &source, From 3d69bbc35158822c9e1371b5c37a24213a8a81fc Mon Sep 17 00:00:00 2001 From: Dominik Steenken Date: Thu, 18 Jul 2024 18:57:25 +0200 Subject: [PATCH 050/486] Allow MAY(R)? to accept the high components of register pairs (#98606) The HFP instructions `MAY` and `MAYR`, unlike any other floating point instructions, allow the specification of a 128bit register pair by either the lower-numbered or the higher-numbered component register. In order to support this, but change as little about codegen as possible, the existing `MAY(R)?` definition is made `CodeGenOnly`, while a copy is provided for the assembler and disassembler, which simply accepts a 64bit floating point register in place of the 128bit one. This copy is stripped of its pattern to prevent codegen from using it. The corresponding assembly tests that checked the register specification rule that this commit removes from `MAY(R)?` have also been removed. 
--- llvm/lib/Target/SystemZ/SystemZInstrHFP.td | 14 ++++++++++++-- llvm/test/MC/SystemZ/insn-bad.s | 8 -------- llvm/test/MC/SystemZ/insn-good.s | 4 ++++ 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td index d2e05b63c6c63..ea194a38090db 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrHFP.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrHFP.td @@ -209,13 +209,23 @@ def MYH : BinaryRXF<"myh", 0xED3D, null_frag, FP64, FP64, z_load, 8>; def MYL : BinaryRXF<"myl", 0xED39, null_frag, FP64, FP64, z_load, 8>; // Fused multiply-add (unnormalized). -def MAYR : TernaryRRD<"mayr", 0xB33A, null_frag, FP128, FP64>; def MAYHR : TernaryRRD<"mayhr", 0xB33C, null_frag, FP64, FP64>; def MAYLR : TernaryRRD<"maylr", 0xB338, null_frag, FP64, FP64>; -def MAY : TernaryRXF<"may", 0xED3A, null_frag, FP128, FP64, z_load, 8>; def MAYH : TernaryRXF<"mayh", 0xED3C, null_frag, FP64, FP64, z_load, 8>; def MAYL : TernaryRXF<"mayl", 0xED38, null_frag, FP64, FP64, z_load, 8>; +// MAY and MAYR allow the user to specify the floating point register pair +// making up the FP128 register by either the lower-numbered register or the +// higher-numbered register, in contrast to all other floating point +// instructions. +// For this reason, the defs below accept `FP64,FP64` instead of `FP128,FP64`. +// This is ok since these instructions are not used in code generation. +// If and when code generation is enabled, the code gen variants should be +// split out from this and use the proper register classes, while these should +// remain for the Assembler and Disassembler to remain compliant with the POP. +def MAY : TernaryRXF<"may", 0xED3A, null_frag, FP64, FP64, z_load, 8>; +def MAYR : TernaryRRD<"mayr", 0xB33A, null_frag, FP64, FP64>; + // Division. 
def DER : BinaryRR <"der", 0x3D, null_frag, FP32, FP32>; def DDR : BinaryRR <"ddr", 0x2D, null_frag, FP64, FP64>; diff --git a/llvm/test/MC/SystemZ/insn-bad.s b/llvm/test/MC/SystemZ/insn-bad.s index 6f94731fa0871..f81278610c73a 100644 --- a/llvm/test/MC/SystemZ/insn-bad.s +++ b/llvm/test/MC/SystemZ/insn-bad.s @@ -4176,12 +4176,9 @@ #CHECK: may %f0, %f0, -1 #CHECK: error: invalid operand #CHECK: may %f0, %f0, 4096 -#CHECK: error: invalid register pair -#CHECK: may %f2, %f0, 0 may %f0, %f0, -1 may %f0, %f0, 4096 - may %f2, %f0, 0 #CHECK: error: invalid operand #CHECK: mayh %f0, %f0, -1 @@ -4199,11 +4196,6 @@ mayl %f0, %f0, -1 mayl %f0, %f0, 4096 -#CHECK: error: invalid register pair -#CHECK: mayr %f2, %f0, %f0 - - mayr %f2, %f0, %f0 - #CHECK: error: invalid operand #CHECK: mc -1, 0 #CHECK: error: invalid operand diff --git a/llvm/test/MC/SystemZ/insn-good.s b/llvm/test/MC/SystemZ/insn-good.s index 9fcb8a42cd73c..553c1b281eb4d 100644 --- a/llvm/test/MC/SystemZ/insn-good.s +++ b/llvm/test/MC/SystemZ/insn-good.s @@ -11526,6 +11526,7 @@ #CHECK: may %f0, %f15, 0 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3a] #CHECK: may %f13, %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0xd0,0x3a] #CHECK: may %f13, %f15, 0 # encoding: [0xed,0xf0,0x00,0x00,0xd0,0x3a] +#CHECK: may %f2, %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x20,0x3a] may %f0, %f0, 0 may %f0, %f0, 4095 @@ -11536,6 +11537,7 @@ may %f0, %f15, 0 may %f13, %f0, 0 may %f13, %f15, 0 + may %f2, %f0, 0 #CHECK: mayh %f0, %f0, 0 # encoding: [0xed,0x00,0x00,0x00,0x00,0x3c] #CHECK: mayh %f0, %f0, 4095 # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3c] @@ -11611,6 +11613,7 @@ #CHECK: mayr %f13, %f0, %f0 # encoding: [0xb3,0x3a,0xd0,0x00] #CHECK: mayr %f5, %f8, %f9 # encoding: [0xb3,0x3a,0x50,0x89] #CHECK: mayr %f13, %f15, %f15 # encoding: [0xb3,0x3a,0xd0,0xff] +#CHECK: mayr %f2, %f0, %f0 # encoding: [0xb3,0x3a,0x20,0x00] mayr %f0, %f0, %f0 mayr %f0, %f0, %f15 @@ -11618,6 +11621,7 @@ mayr %f13, %f0, %f0 mayr %f5, %f8, %f9 mayr %f13, %f15, %f15 + mayr 
%f2, %f0, %f0 #CHECK: mc 0, 0 # encoding: [0xaf,0x00,0x00,0x00] #CHECK: mc 4095, 0 # encoding: [0xaf,0x00,0x0f,0xff] From 574dbe3e9cdaf94b390015a53b76f87bdaf68aae Mon Sep 17 00:00:00 2001 From: Christopher Di Bella Date: Thu, 18 Jul 2024 10:03:55 -0700 Subject: [PATCH 051/486] suppresses unused variable warning (#99526) --- llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index b9cf36a07846c..0a6ce6a135817 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -3361,7 +3361,7 @@ void InstrRefBasedLDV::buildVLocValueMap( continue; if (BlockLiveIn->Kind == DbgValue::VPHI) BlockLiveIn->Kind = DbgValue::Def; - auto &[Var, DILoc] = DVMap.lookupDVID(VarID); + [[maybe_unused]] auto &[Var, DILoc] = DVMap.lookupDVID(VarID); assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() == Var.getFragment() && "Fragment info missing during value prop"); From 74e51e3efe1d4c79c1b7914c3ead19832e8cc1fb Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Thu, 18 Jul 2024 10:00:45 -0700 Subject: [PATCH 052/486] Move the test to the correct folder. A test specified for a target should remain in its designated folder. 
--- llvm/test/Transforms/SLPVectorizer/{ => X86}/revec-fix-99411.ll | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/Transforms/SLPVectorizer/{ => X86}/revec-fix-99411.ll (100%) diff --git a/llvm/test/Transforms/SLPVectorizer/revec-fix-99411.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-fix-99411.ll similarity index 100% rename from llvm/test/Transforms/SLPVectorizer/revec-fix-99411.ll rename to llvm/test/Transforms/SLPVectorizer/X86/revec-fix-99411.ll From 13a8f8d51962b59949496c460ea0b8ad22ae908a Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 19 Jul 2024 01:08:51 +0800 Subject: [PATCH 053/486] [InferAttrs] Set attributes for `remainder` (#99521) Fixes one of the issues in https://github.com/llvm/llvm-project/issues/99497. --- llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 3 +++ llvm/test/Transforms/InferFunctionAttrs/annotate.ll | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index e97506b4bbd95..1ced3657184d4 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1195,6 +1195,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: + case LibFunc_remainder: + case LibFunc_remainderf: + case LibFunc_remainderl: case LibFunc_rint: case LibFunc_rintf: case LibFunc_rintl: diff --git a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll index 456155d7e4437..0944402a91fd0 100644 --- a/llvm/test/Transforms/InferFunctionAttrs/annotate.ll +++ b/llvm/test/Transforms/InferFunctionAttrs/annotate.ll @@ -800,6 +800,15 @@ declare ptr @vec_realloc(ptr, i64) ; CHECK: declare noundef ptr @realpath(ptr nocapture noundef readonly, ptr noundef) [[NOFREE_NOUNWIND]] declare ptr @realpath(ptr, ptr) +; CHECK: declare double @remainder(double, double) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] 
+declare double @remainder(double, double) + +; CHECK: declare float @remainderf(float, float) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare float @remainderf(float, float) + +; CHECK: declare x86_fp80 @remainderl(x86_fp80, x86_fp80) [[NOFREE_NOUNWIND_WILLRETURN_WRITEONLY]] +declare x86_fp80 @remainderl(x86_fp80, x86_fp80) + ; CHECK: declare noundef i32 @remove(ptr nocapture noundef readonly) [[NOFREE_NOUNWIND]] declare i32 @remove(ptr) From 1c55586e9a475a09b7d769e7fc9a254e7150c972 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Thu, 18 Jul 2024 09:07:28 -0700 Subject: [PATCH 054/486] [clang] Fix typo in comments --- clang/lib/Lex/Preprocessor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 44b69a58f3411..63e27e62cffc8 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -988,7 +988,7 @@ void Preprocessor::LexTokensUntilEOF(std::vector *Tokens) { } /// Lex a header-name token (including one formed from header-name-tokens if -/// \p AllowConcatenation is \c true). +/// \p AllowMacroExpansion is \c true). /// /// \param FilenameTok Filled in with the next token. On success, this will /// be either a header_name token. On failure, it will be whatever other From ce8c43fe274f3f090cad2342af6032176efb846f Mon Sep 17 00:00:00 2001 From: Lei Wang Date: Thu, 18 Jul 2024 10:16:44 -0700 Subject: [PATCH 055/486] Fix assertion of null pointer samples in inline replay mode (#99378) Fix https://github.com/llvm/llvm-project/issues/97108. In inline replay mode, `CalleeSamples` may be null and the order doesn't matter. 
--- llvm/lib/Transforms/IPO/SampleProfile.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 5cc2911a1a80e..6af284d513efc 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -439,7 +439,10 @@ struct CandidateComparer { const FunctionSamples *LCS = LHS.CalleeSamples; const FunctionSamples *RCS = RHS.CalleeSamples; - assert(LCS && RCS && "Expect non-null FunctionSamples"); + // In inline replay mode, CalleeSamples may be null and the order doesn't + // matter. + if (!LCS || !RCS) + return LCS; // Tie breaker using number of samples try to favor smaller functions first if (LCS->getBodySamples().size() != RCS->getBodySamples().size()) From b2dcf62c514d3c9c143c85bd029d22098b92c38d Mon Sep 17 00:00:00 2001 From: Meredith Julian <35236176+mjulian31@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:29:24 -0700 Subject: [PATCH 056/486] [NVPTX] fix emission for i1 load and extload (#99392) Currently, an illegal 2-byte load from a 1-byte global variable is being generated. This change instead generates a 1-byte load and zero-extends it to i16 register. This was always the intended behavior of the function. In addition, an i1 ext load of any kind needs to be promoted. A missing setLoadExtAction for ISD::EXTLOAD was causing an "Unhandled source type" unreachable due to an illegal i1 ext load during ISelDAGtoDAG (see below bug). Bug https://github.com/llvm/llvm-project/issues/98033. 
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 8 +++-- llvm/test/CodeGen/NVPTX/i1-ext-load.ll | 34 +++++++++++++++++++++ llvm/test/CodeGen/NVPTX/i1-load-lower.ll | 31 +++++++++++++++++++ 3 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/i1-ext-load.ll create mode 100644 llvm/test/CodeGen/NVPTX/i1-load-lower.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index a2181b478c269..bc23998455a68 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -629,6 +629,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); setTruncStoreAction(VT, MVT::i1, Expand); } @@ -2920,9 +2921,10 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { assert(LD->getExtensionType() == ISD::NON_EXTLOAD); assert(Node->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); - SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), - LD->getPointerInfo(), LD->getAlign(), - LD->getMemOperand()->getFlags()); + SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(), + LD->getBasePtr(), LD->getPointerInfo(), + MVT::i8, LD->getAlign(), + LD->getMemOperand()->getFlags()); SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); // The legalizer (the caller) is expecting two values from the legalized // load, so we build a MergeValues node for it. 
See ExpandUnalignedLoad() diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll new file mode 100644 index 0000000000000..b775e40470047 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function foo --extra_scrub --default-march nvptx64 --filter-out ".*//.*" --filter-out "[\{\}\(\)]" --version 5 + +; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %} + +target triple = "nvptx-nvidia-cuda" + +define void @foo(ptr noalias readonly %ptr, ptr noalias %retval) { +; CHECK-LABEL: foo( +; CHECK: .reg .b16 %rs<2>; +; CHECK: .reg .b32 %r<4>; +; CHECK: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK: ld.param.u64 %rd1, [foo_param_0]; +; CHECK: ld.param.u64 %rd2, [foo_param_1]; +; CHECK: cvta.to.global.u64 %rd3, %rd2; +; CHECK: cvta.to.global.u64 %rd4, %rd1; +; CHECK: ld.global.nc.u8 %rs1, [%rd4]; +; CHECK: cvt.u32.u8 %r1, %rs1; +; CHECK: add.s32 %r2, %r1, 1; +; CHECK: and.b32 %r3, %r2, 1; +; CHECK: st.global.u32 [%rd3], %r3; +; CHECK: ret; + %ld = load i1, ptr %ptr, align 1 + %zext = zext i1 %ld to i32 + %add = add i32 %zext, 1 + %and = and i32 %add, 1 + store i32 %and, ptr %retval + ret void +} + +!nvvm.annotations = !{!0} + +!0 = !{ptr @foo, !"kernel", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll new file mode 100644 index 0000000000000..d1f99b5724de8 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --function foo --extra_scrub --default-march nvptx64 --filter-out ".*//.*" --filter-out "[\(\)\{\}]" --version 5 + +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} + +target triple = 
"nvptx-nvidia-cuda" + +@i1g = addrspace(1) global i1 false, align 2 + +define void @foo() { +; CHECK-LABEL: foo( +; CHECK: .reg .pred %p<2>; +; CHECK: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK: ld.global.u8 %rs1, [i1g]; +; CHECK: and.b16 %rs2, %rs1, 1; +; CHECK: setp.eq.b16 %p1, %rs2, 1; +; CHECK: @%p1 bra $L__BB0_2; +; CHECK: mov.u16 %rs3, 1; +; CHECK: st.global.u8 [i1g], %rs3; +; CHECK: ret; + %tmp = load i1, ptr addrspace(1) @i1g, align 2 + br i1 %tmp, label %if.end, label %if.then + +if.then: + store i1 true, ptr addrspace(1) @i1g, align 2 + br label %if.end + +if.end: + ret void +} From 04bcd74df73af6fed16bfd0d6784fc0aec582bc0 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Thu, 18 Jul 2024 19:49:01 +0200 Subject: [PATCH 057/486] Revert "Add source file name for template instantiations in -ftime-trace" (#99534) Reverts llvm/llvm-project#98320 Breaks windows tests: ``` Step 8 (test-build-unified-tree-check-clang-unit) failure: test (failure) ******************** TEST 'Clang-Unit :: Support/./ClangSupportTests.exe/1/3' FAILED ******************** Script(shard): -- GTEST_OUTPUT=json:C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\build\tools\clang\unittests\Support\.\ClangSupportTests.exe-Clang-Unit-4296-1-3.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=3 GTEST_SHARD_INDEX=1 C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\build\tools\clang\unittests\Support\.\ClangSupportTests.exe -- Script: -- C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\build\tools\clang\unittests\Support\.\ClangSupportTests.exe --gtest_filter=TimeProfilerTest.TemplateInstantiations -- C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\llvm-project\clang\unittests\Support\TimeProfilerTest.cpp(278): error: Expected equality of these values: R"( Frontend | ParseFunctionDefinition (fooB) | ParseFunctionDefinition (fooMTA) | ParseFunctionDefinition (fooA) | ParseDeclarationOrFunctionDefinition (test.cc:3:5) | | ParseFunctionDefinition (user) | PerformPendingInstantiations | | 
InstantiateFunction (fooA, ./a.h:7) | | | InstantiateFunction (fooB, ./b.h:3) | | | InstantiateFunction (fooMTA, ./a.h:4) )" Which is: "\nFrontend\n| ParseFunctionDefinition (fooB)\n| ParseFunctionDefinition (fooMTA)\n| ParseFunctionDefinition (fooA)\n| ParseDeclarationOrFunctionDefinition (test.cc:3:5)\n| | ParseFunctionDefinition (user)\n| PerformPendingInstantiations\n| | InstantiateFunction (fooA, ./a.h:7)\n| | | InstantiateFunction (fooB, ./b.h:3)\n| | | InstantiateFunction (fooMTA, ./a.h:4)\n" buildTraceGraph(Json) Which is: "\nFrontend\n| ParseFunctionDefinition (fooB)\n| ParseFunctionDefinition (fooMTA)\n| ParseFunctionDefinition (fooA)\n| ParseDeclarationOrFunctionDefinition (test.cc:3:5)\n| | ParseFunctionDefinition (user)\n| PerformPendingInstantiations\n| | InstantiateFunction (fooA, .\\a.h:7)\n| | | InstantiateFunction (fooB, .\\b.h:3)\n| | | InstantiateFunction (fooMTA, .\\a.h:4)\n" With diff: @@ -7,5 +7,5 @@ | | ParseFunctionDefinition (user) | PerformPendingInstantiations -| | InstantiateFunction (fooA, ./a.h:7) -| | | InstantiateFunction (fooB, ./b.h:3) -| | | InstantiateFunction (fooMTA, ./a.h:4)\n +| | InstantiateFunction (fooA, .\\a.h:7) +| | | InstantiateFunction (fooB, .\\b.h:3) +| | | InstantiateFunction (fooMTA, .\\a.h:4)\n C:\buildbot\as-builder-3\llvm-clang-x86_64-win-fast\llvm-project\clang\unittests\Support\TimeProfilerTest.cpp:278 Expected equality of these values: R"( Frontend | ParseFunctionDefinition (fooB) | ParseFunctionDefinition (fooMTA) | ParseFunctionDefinition (fooA) | ParseDeclarationOrFunctionDefinition (test.cc:3:5) | | ParseFunctionDefinition (user) | PerformPendingInstantiations | | InstantiateFunction (fooA, ./a.h:7) ``` --- a-abfdec1d.o.tmp | 0 clang/docs/ReleaseNotes.rst | 3 - clang/include/clang/Driver/Options.td | 4 - .../include/clang/Frontend/FrontendOptions.h | 8 +- clang/lib/Driver/ToolChains/Clang.cpp | 1 - clang/lib/Sema/SemaTemplateInstantiate.cpp | 11 +- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 11 +- 
clang/test/Driver/ftime-trace-sections.cpp | 2 +- clang/test/Driver/ftime-trace.cpp | 39 +++--- clang/tools/driver/cc1_main.cpp | 3 +- clang/unittests/Support/TimeProfilerTest.cpp | 121 ++++-------------- llvm/include/llvm/Support/TimeProfiler.h | 23 +--- llvm/lib/Support/TimeProfiler.cpp | 61 ++------- 13 files changed, 64 insertions(+), 223 deletions(-) delete mode 100644 a-abfdec1d.o.tmp diff --git a/a-abfdec1d.o.tmp b/a-abfdec1d.o.tmp deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 971df672b6ca1..e0e86af257a19 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -736,9 +736,6 @@ Improvements to Clang's time-trace - Clang now specifies that using ``auto`` in a lambda parameter is a C++14 extension when appropriate. (`#46059: `_). -- Clang now adds source file infomation for template instantiations as ``event["args"]["filename"]``. This - added behind an option ``-ftime-trace-verbose``. This is expected to increase the size of trace by 2-3 times. - Improvements to Coverage Mapping -------------------------------- diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d3068c1b30a7a..1675e435d210c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3988,10 +3988,6 @@ def ftime_trace_granularity_EQ : Joined<["-"], "ftime-trace-granularity=">, Grou HelpText<"Minimum time granularity (in microseconds) traced by time profiler">, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, MarshallingInfoInt, "500u">; -def ftime_trace_verbose : Joined<["-"], "ftime-trace-verbose">, Group, - HelpText<"Make time trace capture verbose event details (e.g. source filenames). 
This can increase the size of the output by 2-3 times">, - Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, - MarshallingInfoFlag>; def ftime_trace_EQ : Joined<["-"], "ftime-trace=">, Group, HelpText<"Similar to -ftime-trace. Specify the JSON file or a directory which will contain the JSON file">, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h index 8241925c98476..5e5034fe01eb5 100644 --- a/clang/include/clang/Frontend/FrontendOptions.h +++ b/clang/include/clang/Frontend/FrontendOptions.h @@ -580,11 +580,6 @@ class FrontendOptions { /// Minimum time granularity (in microseconds) traced by time profiler. unsigned TimeTraceGranularity; - /// Make time trace capture verbose event details (e.g. source filenames). - /// This can increase the size of the output by 2-3 times. - LLVM_PREFERRED_TYPE(bool) - unsigned TimeTraceVerbose : 1; - /// Path which stores the output files for -ftime-trace std::string TimeTracePath; @@ -606,8 +601,7 @@ class FrontendOptions { EmitSymbolGraph(false), EmitExtensionSymbolGraphs(false), EmitSymbolGraphSymbolLabelsForTesting(false), EmitPrettySymbolGraphs(false), GenReducedBMI(false), - UseClangIRPipeline(false), TimeTraceGranularity(500), - TimeTraceVerbose(false) {} + UseClangIRPipeline(false), TimeTraceGranularity(500) {} /// getInputKindForExtension - Return the appropriate input kind for a file /// extension. For example, "c" would return Language::C. 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6b33301d36401..1fd6fba210042 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6754,7 +6754,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (const char *Name = C.getTimeTraceFile(&JA)) { CmdArgs.push_back(Args.MakeArgString("-ftime-trace=" + Twine(Name))); Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_granularity_EQ); - Args.AddLastArg(CmdArgs, options::OPT_ftime_trace_verbose); } if (Arg *A = Args.getLastArg(options::OPT_ftrapv_handler_EQ)) { diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 725b62db5e80a..a7bc6749c5852 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -3426,16 +3426,11 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation, return true; llvm::TimeTraceScope TimeScope("InstantiateClass", [&]() { - llvm::TimeTraceMetadata M; - llvm::raw_string_ostream OS(M.Detail); + std::string Name; + llvm::raw_string_ostream OS(Name); Instantiation->getNameForDiagnostic(OS, getPrintingPolicy(), /*Qualified=*/true); - if (llvm::isTimeTraceVerbose()) { - auto Loc = SourceMgr.getExpansionLoc(Instantiation->getLocation()); - M.File = SourceMgr.getFilename(Loc); - M.Line = SourceMgr.getExpansionLineNumber(Loc); - } - return M; + return Name; }); Pattern = PatternDef; diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 4e619f4b491a6..01432301633ed 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -4966,16 +4966,11 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, } llvm::TimeTraceScope TimeScope("InstantiateFunction", [&]() { - llvm::TimeTraceMetadata M; - llvm::raw_string_ostream OS(M.Detail); + std::string Name; + 
llvm::raw_string_ostream OS(Name); Function->getNameForDiagnostic(OS, getPrintingPolicy(), /*Qualified=*/true); - if (llvm::isTimeTraceVerbose()) { - auto Loc = SourceMgr.getExpansionLoc(Function->getLocation()); - M.File = SourceMgr.getFilename(Loc); - M.Line = SourceMgr.getExpansionLineNumber(Loc); - } - return M; + return Name; }); // If we're performing recursive template instantiation, create our own diff --git a/clang/test/Driver/ftime-trace-sections.cpp b/clang/test/Driver/ftime-trace-sections.cpp index da7109b9d81a6..0c16052bc0c3a 100644 --- a/clang/test/Driver/ftime-trace-sections.cpp +++ b/clang/test/Driver/ftime-trace-sections.cpp @@ -1,5 +1,5 @@ // RUN: rm -rf %t && mkdir %t && cd %t -// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s +// RUN: %clangxx -S -ftime-trace -ftime-trace-granularity=0 -o out %s // RUN: %python %S/ftime-trace-sections.py < out.json template diff --git a/clang/test/Driver/ftime-trace.cpp b/clang/test/Driver/ftime-trace.cpp index 60c5885704b58..5fe63de915a71 100644 --- a/clang/test/Driver/ftime-trace.cpp +++ b/clang/test/Driver/ftime-trace.cpp @@ -1,18 +1,18 @@ // RUN: rm -rf %t && mkdir -p %t && cd %t -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace -ftime-trace-granularity=0 -o out %s // RUN: cat out.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=new-name.json -ftime-trace-granularity=0 -o out %s // RUN: cat new-name.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s // RUN: mkdir dir1 
dir2 -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir1 -ftime-trace-granularity=0 -o out %s // RUN: cat dir1/out.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s -// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -ftime-trace-verbose -o out %s +// RUN: %clangxx -S -no-canonical-prefixes -ftime-trace=dir2/ -ftime-trace-granularity=0 -o out %s // RUN: cat dir2/out.json \ // RUN: | %python -c 'import json, sys; json.dump(json.loads(sys.stdin.read()), sys.stdout, sort_keys=True, indent=2)' \ // RUN: | FileCheck %s @@ -34,33 +34,32 @@ // RUN: mkdir d e f && cp %s d/a.cpp && touch d/b.c /// TODO: Support -fno-integrated-as. -// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1 -// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -fintegrated-as d/a.cpp -o e/a.o 2>&1 | FileCheck %s --check-prefix=COMPILE1 +// COMPILE1: -cc1{{.*}} "-ftime-trace=e/a.json" "-ftime-trace-granularity=0" -// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2 -// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" -// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// RUN: %clang -### -c -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=COMPILE2 +// COMPILE2: -cc1{{.*}} "-ftime-trace=f/a.json" "-ftime-trace-granularity=0" +// COMPILE2: -cc1{{.*}} "-ftime-trace=f/b.json" 
"-ftime-trace-granularity=0" /// -o specifies the link output. Create ${output}-${basename}.json. -// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1 -// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" -// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x 2>&1 | FileCheck %s --check-prefix=LINK1 +// LINK1: -cc1{{.*}} "-ftime-trace=e/x-a.json" "-ftime-trace-granularity=0" +// LINK1: -cc1{{.*}} "-ftime-trace=e/x-b.json" "-ftime-trace-granularity=0" /// -dumpdir is f/g, not ending with a path separator. We create f/g${basename}.json. -// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2 -// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" -// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// RUN: %clang -### -ftime-trace -ftime-trace-granularity=0 d/a.cpp d/b.c -o e/x -dumpdir f/g 2>&1 | FileCheck %s --check-prefix=LINK2 +// LINK2: -cc1{{.*}} "-ftime-trace=f/ga.json" "-ftime-trace-granularity=0" +// LINK2: -cc1{{.*}} "-ftime-trace=f/gb.json" "-ftime-trace-granularity=0" -// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 -ftime-trace-verbose d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3 -// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" -// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0" "-ftime-trace-verbose" +// RUN: %clang -### -ftime-trace=e -ftime-trace-granularity=0 d/a.cpp d/b.c -o f/x -dumpdir f/ 2>&1 | FileCheck %s --check-prefix=LINK3 +// LINK3: -cc1{{.*}} 
"-ftime-trace=e{{/|\\\\}}a-{{[^.]*}}.json" "-ftime-trace-granularity=0" +// LINK3: -cc1{{.*}} "-ftime-trace=e{{/|\\\\}}b-{{[^.]*}}.json" "-ftime-trace-granularity=0" -// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -ftime-trace-verbose -xassembler d/a.cpp 2>&1 | \ +// RUN: %clang -### -ftime-trace -ftime-trace=e -ftime-trace-granularity=1 -xassembler d/a.cpp 2>&1 | \ // RUN: FileCheck %s --check-prefix=UNUSED // UNUSED: warning: argument unused during compilation: '-ftime-trace' // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace=e' // UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-granularity=1' -// UNUSED-NEXT: warning: argument unused during compilation: '-ftime-trace-verbose' // UNUSED-NOT: warning: template diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index f5e5fad36573e..c2ccb47a15bc8 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -241,8 +241,7 @@ int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { if (!Clang->getFrontendOpts().TimeTracePath.empty()) { llvm::timeTraceProfilerInitialize( - Clang->getFrontendOpts().TimeTraceGranularity, Argv0, - Clang->getFrontendOpts().TimeTraceVerbose); + Clang->getFrontendOpts().TimeTraceGranularity, Argv0); } // --print-supported-cpus takes priority over the actual compilation. 
if (Clang->getFrontendOpts().PrintSupportedCPUs) diff --git a/clang/unittests/Support/TimeProfilerTest.cpp b/clang/unittests/Support/TimeProfilerTest.cpp index 96e137508ed94..5f3950ff033f1 100644 --- a/clang/unittests/Support/TimeProfilerTest.cpp +++ b/clang/unittests/Support/TimeProfilerTest.cpp @@ -10,14 +10,11 @@ #include "clang/Frontend/FrontendActions.h" #include "clang/Lex/PreprocessorOptions.h" -#include "llvm/ADT/StringMap.h" #include "llvm/Support/JSON.h" #include "llvm/Support/TimeProfiler.h" -#include "llvm/Support/VirtualFileSystem.h" #include #include "gtest/gtest.h" -#include using namespace clang; using namespace llvm; @@ -26,8 +23,7 @@ namespace { // Should be called before testing. void setupProfiler() { - timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test", - /*TimeTraceVerbose=*/true); + timeTraceProfilerInitialize(/*TimeTraceGranularity=*/0, "test"); } // Should be called after `compileFromString()`. @@ -42,24 +38,14 @@ std::string teardownProfiler() { // Returns true if code compiles successfully. // We only parse AST here. This is enough for constexpr evaluation. 
-bool compileFromString(StringRef Code, StringRef Standard, StringRef File, - llvm::StringMap Headers = {}) { +bool compileFromString(StringRef Code, StringRef Standard, StringRef FileName) { CompilerInstance Compiler; Compiler.createDiagnostics(); - llvm::IntrusiveRefCntPtr FS( - new llvm::vfs::InMemoryFileSystem()); - FS->addFile(File, 0, MemoryBuffer::getMemBuffer(Code)); - for (const auto &Header : Headers) { - FS->addFile(Header.getKey(), 0, - MemoryBuffer::getMemBuffer(Header.getValue())); - } - llvm::IntrusiveRefCntPtr Files( - new FileManager(FileSystemOptions(), FS)); - Compiler.setFileManager(Files.get()); - auto Invocation = std::make_shared(); - std::vector Args = {Standard.data(), File.data()}; + Invocation->getPreprocessorOpts().addRemappedFile( + FileName, MemoryBuffer::getMemBuffer(Code).release()); + const char *Args[] = {Standard.data(), FileName.data()}; CompilerInvocation::CreateFromArgs(*Invocation, Args, Compiler.getDiagnostics()); Compiler.setInvocation(std::move(Invocation)); @@ -74,27 +60,13 @@ bool compileFromString(StringRef Code, StringRef Standard, StringRef File, return Compiler.ExecuteAction(Action); } -std::string GetMetadata(json::Object *Event) { - std::string Metadata; - llvm::raw_string_ostream OS(Metadata); - if (json::Object *Args = Event->getObject("args")) { - if (auto Detail = Args->getString("detail")) - OS << Detail->str(); - if (auto File = Args->getString("file")) - OS << ", " << File->str(); - if (auto Line = Args->getInteger("line")) - OS << ":" << *Line; - } - return Metadata; -} - // Returns pretty-printed trace graph. 
std::string buildTraceGraph(StringRef Json) { struct EventRecord { int64_t TimestampBegin; int64_t TimestampEnd; - std::string Name; - std::string Metadata; + StringRef Name; + StringRef Detail; }; std::vector Events; @@ -109,13 +81,10 @@ std::string buildTraceGraph(StringRef Json) { int64_t TimestampBegin = TraceEventObj->getInteger("ts").value_or(0); int64_t TimestampEnd = TimestampBegin + TraceEventObj->getInteger("dur").value_or(0); - std::string Name = TraceEventObj->getString("name").value_or("").str(); - std::string Metadata = GetMetadata(TraceEventObj); - - // Source events are asynchronous events and may not perfectly nest the - // synchronous events. Skip testing them. - if (Name == "Source") - continue; + StringRef Name = TraceEventObj->getString("name").value_or(""); + StringRef Detail = ""; + if (json::Object *Args = TraceEventObj->getObject("args")) + Detail = Args->getString("detail").value_or(""); // This is a "summary" event, like "Total PerformPendingInstantiations", // skip it @@ -123,7 +92,7 @@ std::string buildTraceGraph(StringRef Json) { continue; Events.emplace_back( - EventRecord{TimestampBegin, TimestampEnd, Name, Metadata}); + EventRecord{TimestampBegin, TimestampEnd, Name, Detail}); } // There can be nested events that are very fast, for example: @@ -163,9 +132,9 @@ std::string buildTraceGraph(StringRef Json) { Stream << "| "; } Stream.write(Event.Name.data(), Event.Name.size()); - if (!Event.Metadata.empty()) { + if (!Event.Detail.empty()) { Stream << " ("; - Stream.write(Event.Metadata.data(), Event.Metadata.size()); + Stream.write(Event.Detail.data(), Event.Detail.size()); Stream << ")"; } Stream << "\n"; @@ -176,7 +145,7 @@ std::string buildTraceGraph(StringRef Json) { } // namespace TEST(TimeProfilerTest, ConstantEvaluationCxx20) { - std::string Code = R"( + constexpr StringRef Code = R"( void print(double value); namespace slow_namespace { @@ -206,7 +175,8 @@ constexpr int slow_init_list[] = {1, 1, 2, 3, 5, 8, 13, 21}; // 25th line 
setupProfiler(); ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc")); std::string Json = teardownProfiler(); - ASSERT_EQ(R"( + std::string TraceGraph = buildTraceGraph(Json); + ASSERT_TRUE(TraceGraph == R"( Frontend | ParseDeclarationOrFunctionDefinition (test.cc:2:1) | ParseDeclarationOrFunctionDefinition (test.cc:6:1) @@ -232,54 +202,14 @@ Frontend | ParseDeclarationOrFunctionDefinition (test.cc:25:1) | | EvaluateAsInitializer (slow_init_list) | PerformPendingInstantiations -)", - buildTraceGraph(Json)); -} - -TEST(TimeProfilerTest, TemplateInstantiations) { - std::string B_H = R"( - template - T fooB(T t) { - return T(); - } +)"); - #define MacroTemp(x) template void foo##x(T) { T(); } - )"; - - std::string A_H = R"( - #include "b.h" - - MacroTemp(MTA) - - template - void fooA(T t) { fooB(t); fooMTA(t); } - )"; - std::string Code = R"( - #include "a.h" - void user() { fooA(0); } - )"; - - setupProfiler(); - ASSERT_TRUE(compileFromString(Code, "-std=c++20", "test.cc", - /*Headers=*/{{"a.h", A_H}, {"b.h", B_H}})); - std::string Json = teardownProfiler(); - ASSERT_EQ(R"( -Frontend -| ParseFunctionDefinition (fooB) -| ParseFunctionDefinition (fooMTA) -| ParseFunctionDefinition (fooA) -| ParseDeclarationOrFunctionDefinition (test.cc:3:5) -| | ParseFunctionDefinition (user) -| PerformPendingInstantiations -| | InstantiateFunction (fooA, ./a.h:7) -| | | InstantiateFunction (fooB, ./b.h:3) -| | | InstantiateFunction (fooMTA, ./a.h:4) -)", - buildTraceGraph(Json)); + // NOTE: If this test is failing, run this test with + // `llvm::errs() << TraceGraph;` and change the assert above. 
} TEST(TimeProfilerTest, ConstantEvaluationC99) { - std::string Code = R"( + constexpr StringRef Code = R"( struct { short quantval[4]; // 3rd line } value; @@ -288,12 +218,15 @@ struct { setupProfiler(); ASSERT_TRUE(compileFromString(Code, "-std=c99", "test.c")); std::string Json = teardownProfiler(); - ASSERT_EQ(R"( + std::string TraceGraph = buildTraceGraph(Json); + ASSERT_TRUE(TraceGraph == R"( Frontend | ParseDeclarationOrFunctionDefinition (test.c:2:1) | | isIntegerConstantExpr () | | EvaluateKnownConstIntCheckOverflow () | PerformPendingInstantiations -)", - buildTraceGraph(Json)); +)"); + + // NOTE: If this test is failing, run this test with + // `llvm::errs() << TraceGraph;` and change the assert above. } diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h index 6eb92930b36fd..31f7df10916db 100644 --- a/llvm/include/llvm/Support/TimeProfiler.h +++ b/llvm/include/llvm/Support/TimeProfiler.h @@ -83,28 +83,16 @@ namespace llvm { class raw_pwrite_stream; -struct TimeTraceMetadata { - std::string Detail; - // Source file and line number information for the event. - std::string File; - int Line; - - bool isEmpty() const { return Detail.empty() && File.empty(); } -}; - struct TimeTraceProfiler; TimeTraceProfiler *getTimeTraceProfilerInstance(); -bool isTimeTraceVerbose(); - struct TimeTraceProfilerEntry; /// Initialize the time trace profiler. /// This sets up the global \p TimeTraceProfilerInstance /// variable to be the profiler instance. void timeTraceProfilerInitialize(unsigned TimeTraceGranularity, - StringRef ProcName, - bool TimeTraceVerbose = false); + StringRef ProcName); /// Cleanup the time trace profiler, if it was initialized. 
void timeTraceProfilerCleanup(); @@ -140,10 +128,6 @@ TimeTraceProfilerEntry * timeTraceProfilerBegin(StringRef Name, llvm::function_ref Detail); -TimeTraceProfilerEntry * -timeTraceProfilerBegin(StringRef Name, - llvm::function_ref MetaData); - /// Manually begin a time section, with the given \p Name and \p Detail. /// This starts Async Events having \p Name as a category which is shown /// separately from other traces. See @@ -180,11 +164,6 @@ class TimeTraceScope { if (getTimeTraceProfilerInstance() != nullptr) Entry = timeTraceProfilerBegin(Name, Detail); } - TimeTraceScope(StringRef Name, - llvm::function_ref Metadata) { - if (getTimeTraceProfilerInstance() != nullptr) - Entry = timeTraceProfilerBegin(Name, Metadata); - } ~TimeTraceScope() { if (getTimeTraceProfilerInstance() != nullptr) timeTraceProfilerEnd(Entry); diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index c2014028ddadc..9612db7d30f98 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ b/llvm/lib/Support/TimeProfiler.cpp @@ -73,20 +73,12 @@ struct llvm::TimeTraceProfilerEntry { const TimePointType Start; TimePointType End; const std::string Name; - TimeTraceMetadata Metadata; - + const std::string Detail; const bool AsyncEvent = false; TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N, std::string &&Dt, bool Ae) - : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), Metadata(), - AsyncEvent(Ae) { - Metadata.Detail = std::move(Dt); - } - - TimeTraceProfilerEntry(TimePointType &&S, TimePointType &&E, std::string &&N, - TimeTraceMetadata &&Mt, bool Ae) : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), - Metadata(std::move(Mt)), AsyncEvent(Ae) {} + Detail(std::move(Dt)), AsyncEvent(Ae) {} // Calculate timings for FlameGraph. Cast time points to microsecond precision // rather than casting duration. 
This avoids truncation issues causing inner @@ -105,12 +97,10 @@ struct llvm::TimeTraceProfilerEntry { }; struct llvm::TimeTraceProfiler { - TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "", - bool TimeTraceVerbose = false) + TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "") : BeginningOfTime(system_clock::now()), StartTime(ClockType::now()), ProcName(ProcName), Pid(sys::Process::getProcessId()), - Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity), - TimeTraceVerbose(TimeTraceVerbose) { + Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) { llvm::get_thread_name(ThreadName); } @@ -123,15 +113,6 @@ struct llvm::TimeTraceProfiler { return Stack.back().get(); } - TimeTraceProfilerEntry * - begin(std::string Name, llvm::function_ref Metadata, - bool AsyncEvent = false) { - Stack.emplace_back(std::make_unique( - ClockType::now(), TimePointType(), std::move(Name), Metadata(), - AsyncEvent)); - return Stack.back().get(); - } - void end() { assert(!Stack.empty() && "Must call begin() first"); end(*Stack.back()); @@ -203,15 +184,8 @@ struct llvm::TimeTraceProfiler { J.attribute("dur", DurUs); } J.attribute("name", E.Name); - if (!E.Metadata.isEmpty()) { - J.attributeObject("args", [&] { - if (!E.Metadata.Detail.empty()) - J.attribute("detail", E.Metadata.Detail); - if (!E.Metadata.File.empty()) - J.attribute("file", E.Metadata.File); - if (E.Metadata.Line > 0) - J.attribute("line", E.Metadata.Line); - }); + if (!E.Detail.empty()) { + J.attributeObject("args", [&] { J.attribute("detail", E.Detail); }); } }); @@ -333,25 +307,14 @@ struct llvm::TimeTraceProfiler { // Minimum time granularity (in microseconds) const unsigned TimeTraceGranularity; - - // Make time trace capture verbose event details (e.g. source filenames). This - // can increase the size of the output by 2-3 times. 
- const bool TimeTraceVerbose; }; -bool llvm::isTimeTraceVerbose() { - return getTimeTraceProfilerInstance() && - getTimeTraceProfilerInstance()->TimeTraceVerbose; -} - void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity, - StringRef ProcName, - bool TimeTraceVerbose) { + StringRef ProcName) { assert(TimeTraceProfilerInstance == nullptr && "Profiler should not be initialized"); TimeTraceProfilerInstance = new TimeTraceProfiler( - TimeTraceGranularity, llvm::sys::path::filename(ProcName), - TimeTraceVerbose); + TimeTraceGranularity, llvm::sys::path::filename(ProcName)); } // Removes all TimeTraceProfilerInstances. @@ -418,14 +381,6 @@ llvm::timeTraceProfilerBegin(StringRef Name, return nullptr; } -TimeTraceProfilerEntry * -llvm::timeTraceProfilerBegin(StringRef Name, - llvm::function_ref Metadata) { - if (TimeTraceProfilerInstance != nullptr) - return TimeTraceProfilerInstance->begin(std::string(Name), Metadata, false); - return nullptr; -} - TimeTraceProfilerEntry *llvm::timeTraceAsyncProfilerBegin(StringRef Name, StringRef Detail) { if (TimeTraceProfilerInstance != nullptr) From 9fb049c8c6a77026fa75a8d36b386a7f5a60613a Mon Sep 17 00:00:00 2001 From: OverMighty Date: Thu, 18 Jul 2024 19:50:49 +0200 Subject: [PATCH 058/486] [libc][math][c23] Add {f,d}mul{l,f128} and f16mul{,f,l,f128} C23 math functions (#98972) Part of #93566. Fixes #94833. 
--- libc/config/linux/aarch64/entrypoints.txt | 4 + libc/config/linux/x86_64/entrypoints.txt | 8 + libc/docs/math/index.rst | 6 +- libc/spec/llvm_libc_ext.td | 8 + libc/spec/stdc.td | 9 +- .../__support/FPUtil/generic/CMakeLists.txt | 17 ++ libc/src/__support/FPUtil/generic/mul.h | 105 +++++++++++ libc/src/math/CMakeLists.txt | 10 + libc/src/math/dmulf128.h | 21 +++ libc/src/math/dmull.h | 20 ++ libc/src/math/f16mul.h | 21 +++ libc/src/math/f16mulf.h | 21 +++ libc/src/math/f16mulf128.h | 21 +++ libc/src/math/f16mull.h | 21 +++ libc/src/math/fmulf128.h | 21 +++ libc/src/math/fmull.h | 20 ++ libc/src/math/generic/CMakeLists.txt | 108 ++++++++++- libc/src/math/generic/dmulf128.cpp | 20 ++ libc/src/math/generic/dmull.cpp | 20 ++ libc/src/math/generic/f16mul.cpp | 20 ++ libc/src/math/generic/f16mulf.cpp | 20 ++ libc/src/math/generic/f16mulf128.cpp | 20 ++ libc/src/math/generic/f16mull.cpp | 20 ++ libc/src/math/generic/fmul.cpp | 115 +----------- libc/src/math/generic/fmulf128.cpp | 20 ++ libc/src/math/generic/fmull.cpp | 20 ++ libc/test/src/math/CMakeLists.txt | 80 +++++++- libc/test/src/math/FMulTest.h | 121 ------------- libc/test/src/math/MulTest.h | 95 ++++++++++ libc/test/src/math/dmull_test.cpp | 13 ++ libc/test/src/math/f16mul_test.cpp | 13 ++ libc/test/src/math/f16mulf_test.cpp | 13 ++ libc/test/src/math/f16mull_test.cpp | 13 ++ libc/test/src/math/fmul_test.cpp | 8 +- libc/test/src/math/fmull_test.cpp | 13 ++ libc/test/src/math/smoke/CMakeLists.txt | 96 +++++++++- libc/test/src/math/smoke/FMulTest.h | 104 ----------- libc/test/src/math/smoke/MulTest.h | 171 ++++++++++++++++++ libc/test/src/math/smoke/dmulf128_test.cpp | 13 ++ libc/test/src/math/smoke/dmull_test.cpp | 13 ++ libc/test/src/math/smoke/f16mul_test.cpp | 13 ++ libc/test/src/math/smoke/f16mulf128_test.cpp | 13 ++ libc/test/src/math/smoke/f16mulf_test.cpp | 13 ++ libc/test/src/math/smoke/f16mull_test.cpp | 13 ++ libc/test/src/math/smoke/fmul_test.cpp | 8 +- libc/test/src/math/smoke/fmulf128_test.cpp | 
13 ++ libc/test/src/math/smoke/fmull_test.cpp | 13 ++ libc/utils/MPFRWrapper/MPFRUtils.cpp | 21 ++- libc/utils/MPFRWrapper/MPFRUtils.h | 2 +- 49 files changed, 1227 insertions(+), 364 deletions(-) create mode 100644 libc/src/__support/FPUtil/generic/mul.h create mode 100644 libc/src/math/dmulf128.h create mode 100644 libc/src/math/dmull.h create mode 100644 libc/src/math/f16mul.h create mode 100644 libc/src/math/f16mulf.h create mode 100644 libc/src/math/f16mulf128.h create mode 100644 libc/src/math/f16mull.h create mode 100644 libc/src/math/fmulf128.h create mode 100644 libc/src/math/fmull.h create mode 100644 libc/src/math/generic/dmulf128.cpp create mode 100644 libc/src/math/generic/dmull.cpp create mode 100644 libc/src/math/generic/f16mul.cpp create mode 100644 libc/src/math/generic/f16mulf.cpp create mode 100644 libc/src/math/generic/f16mulf128.cpp create mode 100644 libc/src/math/generic/f16mull.cpp create mode 100644 libc/src/math/generic/fmulf128.cpp create mode 100644 libc/src/math/generic/fmull.cpp delete mode 100644 libc/test/src/math/FMulTest.h create mode 100644 libc/test/src/math/MulTest.h create mode 100644 libc/test/src/math/dmull_test.cpp create mode 100644 libc/test/src/math/f16mul_test.cpp create mode 100644 libc/test/src/math/f16mulf_test.cpp create mode 100644 libc/test/src/math/f16mull_test.cpp create mode 100644 libc/test/src/math/fmull_test.cpp delete mode 100644 libc/test/src/math/smoke/FMulTest.h create mode 100644 libc/test/src/math/smoke/MulTest.h create mode 100644 libc/test/src/math/smoke/dmulf128_test.cpp create mode 100644 libc/test/src/math/smoke/dmull_test.cpp create mode 100644 libc/test/src/math/smoke/f16mul_test.cpp create mode 100644 libc/test/src/math/smoke/f16mulf128_test.cpp create mode 100644 libc/test/src/math/smoke/f16mulf_test.cpp create mode 100644 libc/test/src/math/smoke/f16mull_test.cpp create mode 100644 libc/test/src/math/smoke/fmulf128_test.cpp create mode 100644 libc/test/src/math/smoke/fmull_test.cpp diff --git 
a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 9b718c3f81151..208889ba34a59 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -357,6 +357,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cosf libc.src.math.coshf libc.src.math.cospif + libc.src.math.dmull libc.src.math.erff libc.src.math.exp libc.src.math.exp10 @@ -411,6 +412,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmodf libc.src.math.fmodl libc.src.math.fmul + libc.src.math.fmull libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl @@ -531,6 +533,8 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.f16div libc.src.math.f16divf libc.src.math.f16fmaf + libc.src.math.f16mul + libc.src.math.f16mulf libc.src.math.f16sqrt libc.src.math.f16sqrtf libc.src.math.f16sub diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 4d19a28f4a2b3..cbdee084aa199 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -382,6 +382,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cosf libc.src.math.coshf libc.src.math.cospif + libc.src.math.dmull libc.src.math.erff libc.src.math.exp libc.src.math.exp10 @@ -437,6 +438,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmodf libc.src.math.fmodl libc.src.math.fmul + libc.src.math.fmull libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl @@ -561,6 +563,9 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.f16fma libc.src.math.f16fmaf libc.src.math.f16fmal + libc.src.math.f16mul + libc.src.math.f16mulf + libc.src.math.f16mull libc.src.math.f16sqrt libc.src.math.f16sqrtf libc.src.math.f16sqrtl @@ -622,6 +627,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.f16addf128 libc.src.math.f16divf128 libc.src.math.f16fmaf128 + libc.src.math.f16mulf128 libc.src.math.f16sqrtf128 libc.src.math.f16subf128 ) @@ -634,6 +640,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.canonicalizef128 libc.src.math.ceilf128 
libc.src.math.copysignf128 + libc.src.math.dmulf128 libc.src.math.fabsf128 libc.src.math.fdimf128 libc.src.math.floorf128 @@ -648,6 +655,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.fminimum_numf128 libc.src.math.fminimumf128 libc.src.math.fmodf128 + libc.src.math.fmulf128 libc.src.math.frexpf128 libc.src.math.fromfpf128 libc.src.math.fromfpxf128 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 205d14946535e..5fab9ce4df949 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -120,7 +120,7 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dfma | N/A | N/A | | N/A | | 7.12.14.5 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| dmul | N/A | N/A | | N/A | | 7.12.14.3 | F.10.11 | +| dmul | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.3 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dsub | N/A | N/A | | N/A | | 7.12.14.2 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -130,6 +130,8 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | f16fma | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.5 | F.10.11 | 
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ +| f16mul | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.5 | F.10.11 | ++------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | f16sub | |check|\* | |check|\* | |check|\* | N/A | |check| | 7.12.14.2 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | fabs | |check| | |check| | |check| | |check| | |check| | 7.12.7.3 | F.10.4.3 | @@ -166,7 +168,7 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | fmod | |check| | |check| | |check| | |check| | |check| | 7.12.10.1 | F.10.7.1 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fmul | N/A | |check| | | N/A | | 7.12.14.3 | F.10.11 | +| fmul | N/A | |check| | |check| | N/A | |check|\* | 7.12.14.3 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | frexp | |check| | |check| | |check| | |check| | |check| | 7.12.6.7 | F.10.3.7 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td index 86215029831ca..55b354cae74e0 100644 --- 
a/libc/spec/llvm_libc_ext.td +++ b/libc/spec/llvm_libc_ext.td @@ -65,6 +65,14 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> { GuardedFunctionSpec<"f16subf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16subl", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"fmulf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, + + GuardedFunctionSpec<"dmulf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, + + GuardedFunctionSpec<"f16mul", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16mulf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16mull", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"f16div", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16divf", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"f16divl", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index a4c6b40b98388..18592e92d330a 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -475,8 +475,6 @@ def StdC : StandardSpec<"stdc"> { GuardedFunctionSpec<"fminimum_mag_numf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fminimum_mag_numf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, - FunctionSpec<"fmul", RetValSpec, [ArgSpec, ArgSpec]>, - FunctionSpec<"fma", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"fmaf", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, @@ -733,6 +731,13 @@ def StdC : StandardSpec<"stdc"> { GuardedFunctionSpec<"f16subf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, + FunctionSpec<"fmul", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"fmull", RetValSpec, [ArgSpec, ArgSpec]>, + + FunctionSpec<"dmull", RetValSpec, [ArgSpec, ArgSpec]>, + + 
GuardedFunctionSpec<"f16mulf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, + GuardedFunctionSpec<"f16divf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, GuardedFunctionSpec<"f16sqrtf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128">, diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt index c73f68723e232..43096aa529fc3 100644 --- a/libc/src/__support/FPUtil/generic/CMakeLists.txt +++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt @@ -84,3 +84,20 @@ add_header_library( libc.src.__support.macros.attributes libc.src.__support.macros.optimization ) + +add_header_library( + mul + HDRS + mul.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.CPP.bit + libc.src.__support.CPP.type_traits + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.macros.attributes + libc.src.__support.macros.optimization +) diff --git a/libc/src/__support/FPUtil/generic/mul.h b/libc/src/__support/FPUtil/generic/mul.h new file mode 100644 index 0000000000000..02fc69c6cb1ba --- /dev/null +++ b/libc/src/__support/FPUtil/generic/mul.h @@ -0,0 +1,105 @@ +//===-- Multiplication of IEEE 754 floating-point numbers -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_MUL_H +#define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_MUL_H + +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { +namespace fputil::generic { + +template +LIBC_INLINE cpp::enable_if_t && + cpp::is_floating_point_v && + sizeof(OutType) <= sizeof(InType), + OutType> +mul(InType x, InType y) { + using OutFPBits = FPBits; + using OutStorageType = typename OutFPBits::StorageType; + using InFPBits = FPBits; + using InStorageType = typename InFPBits::StorageType; + // The product of two p-digit numbers is a 2p-digit number. + using DyadicFloat = + DyadicFloat(InFPBits::SIG_LEN))>; + + InFPBits x_bits(x); + InFPBits y_bits(y); + + Sign result_sign = x_bits.sign() == y_bits.sign() ? 
Sign::POS : Sign::NEG; + + if (LIBC_UNLIKELY(x_bits.is_inf_or_nan() || y_bits.is_inf_or_nan() || + x_bits.is_zero() || y_bits.is_zero())) { + if (x_bits.is_nan() || y_bits.is_nan()) { + if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan()) + raise_except_if_required(FE_INVALID); + + if (x_bits.is_quiet_nan()) { + InStorageType x_payload = static_cast(getpayload(x)); + if ((x_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) + return OutFPBits::quiet_nan(x_bits.sign(), + static_cast(x_payload)) + .get_val(); + } + + if (y_bits.is_quiet_nan()) { + InStorageType y_payload = static_cast(getpayload(y)); + if ((y_payload & ~(OutFPBits::FRACTION_MASK >> 1)) == 0) + return OutFPBits::quiet_nan(y_bits.sign(), + static_cast(y_payload)) + .get_val(); + } + + return OutFPBits::quiet_nan().get_val(); + } + + if (x_bits.is_inf()) { + if (y_bits.is_zero()) { + set_errno_if_required(EDOM); + raise_except_if_required(FE_INVALID); + return OutFPBits::quiet_nan().get_val(); + } + + return OutFPBits::inf(result_sign).get_val(); + } + + if (y_bits.is_inf()) { + if (x_bits.is_zero()) { + set_errno_if_required(EDOM); + raise_except_if_required(FE_INVALID); + return OutFPBits::quiet_nan().get_val(); + } + + return OutFPBits::inf(result_sign).get_val(); + } + + // Now either x or y is zero, and the other one is finite. 
+ return OutFPBits::zero(result_sign).get_val(); + } + + DyadicFloat xd(x); + DyadicFloat yd(y); + + DyadicFloat result = quick_mul(xd, yd); + return result.template as(); +} + +} // namespace fputil::generic +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_MUL_H diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index dc2339896f2bb..c4e33130e9090 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -86,6 +86,9 @@ add_math_entrypoint_object(cosh) add_math_entrypoint_object(coshf) add_math_entrypoint_object(cospif) +add_math_entrypoint_object(dmull) +add_math_entrypoint_object(dmulf128) + add_math_entrypoint_object(erf) add_math_entrypoint_object(erff) @@ -118,6 +121,11 @@ add_math_entrypoint_object(f16fmaf) add_math_entrypoint_object(f16fmal) add_math_entrypoint_object(f16fmaf128) +add_math_entrypoint_object(f16mul) +add_math_entrypoint_object(f16mulf) +add_math_entrypoint_object(f16mull) +add_math_entrypoint_object(f16mulf128) + add_math_entrypoint_object(f16sqrt) add_math_entrypoint_object(f16sqrtf) add_math_entrypoint_object(f16sqrtl) @@ -210,6 +218,8 @@ add_math_entrypoint_object(fminimum_mag_numf16) add_math_entrypoint_object(fminimum_mag_numf128) add_math_entrypoint_object(fmul) +add_math_entrypoint_object(fmull) +add_math_entrypoint_object(fmulf128) add_math_entrypoint_object(fmod) add_math_entrypoint_object(fmodf) diff --git a/libc/src/math/dmulf128.h b/libc/src/math/dmulf128.h new file mode 100644 index 0000000000000..623f22c910a41 --- /dev/null +++ b/libc/src/math/dmulf128.h @@ -0,0 +1,21 @@ +//===-- Implementation header for dmulf128 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_DMULF128_H +#define LLVM_LIBC_SRC_MATH_DMULF128_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +double dmulf128(float128 x, float128 y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_DMULF128_H diff --git a/libc/src/math/dmull.h b/libc/src/math/dmull.h new file mode 100644 index 0000000000000..656776a603009 --- /dev/null +++ b/libc/src/math/dmull.h @@ -0,0 +1,20 @@ +//===-- Implementation header for dmull -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_DMULL_H +#define LLVM_LIBC_SRC_MATH_DMULL_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double dmull(long double x, long double y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_DMULL_H diff --git a/libc/src/math/f16mul.h b/libc/src/math/f16mul.h new file mode 100644 index 0000000000000..89403cf219271 --- /dev/null +++ b/libc/src/math/f16mul.h @@ -0,0 +1,21 @@ +//===-- Implementation header for f16mul ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16MUL_H +#define LLVM_LIBC_SRC_MATH_F16MUL_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 f16mul(double x, double y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_F16MUL_H diff --git a/libc/src/math/f16mulf.h b/libc/src/math/f16mulf.h new file mode 100644 index 0000000000000..755886d6f14d0 --- /dev/null +++ b/libc/src/math/f16mulf.h @@ -0,0 +1,21 @@ +//===-- Implementation header for f16mulf -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16MULF_H +#define LLVM_LIBC_SRC_MATH_F16MULF_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 f16mulf(float x, float y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_F16MULF_H diff --git a/libc/src/math/f16mulf128.h b/libc/src/math/f16mulf128.h new file mode 100644 index 0000000000000..14371c57ca88c --- /dev/null +++ b/libc/src/math/f16mulf128.h @@ -0,0 +1,21 @@ +//===-- Implementation header for f16mulf128 --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16MULF128_H +#define LLVM_LIBC_SRC_MATH_F16MULF128_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 f16mulf128(float128 x, float128 y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_F16MULF128_H diff --git a/libc/src/math/f16mull.h b/libc/src/math/f16mull.h new file mode 100644 index 0000000000000..a3177cadc1306 --- /dev/null +++ b/libc/src/math/f16mull.h @@ -0,0 +1,21 @@ +//===-- Implementation header for f16mull -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_F16MULL_H +#define LLVM_LIBC_SRC_MATH_F16MULL_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 f16mull(long double x, long double y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_F16MULL_H diff --git a/libc/src/math/fmulf128.h b/libc/src/math/fmulf128.h new file mode 100644 index 0000000000000..94137ae87eb1e --- /dev/null +++ b/libc/src/math/fmulf128.h @@ -0,0 +1,21 @@ +//===-- Implementation header for fmulf128 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMULF128_H +#define LLVM_LIBC_SRC_MATH_FMULF128_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float fmulf128(float128 x, float128 y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_FMULF128_H diff --git a/libc/src/math/fmull.h b/libc/src/math/fmull.h new file mode 100644 index 0000000000000..46e6c77cc66a2 --- /dev/null +++ b/libc/src/math/fmull.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fmull -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMULL_H +#define LLVM_LIBC_SRC_MATH_FMULL_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +float fmull(long double x, long double y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_FMULL_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 415ca3fbce796..9c86bac4a0cb7 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -2633,11 +2633,32 @@ add_entrypoint_object( HDRS ../fmul.h DEPENDS - libc.src.__support.FPUtil.basic_operations - libc.src.__support.uint128 - libc.src.__support.CPP.bit - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.rounding_mode + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + fmull + SRCS + fmull.cpp + HDRS + ../fmull.h + DEPENDS + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) + 
+add_entrypoint_object( + fmulf128 + SRCS + fmulf128.cpp + HDRS + ../fmulf128.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.mul COMPILE_OPTIONS -O3 ) @@ -4200,3 +4221,80 @@ add_entrypoint_object( libc.src.__support.macros.optimization libc.src.__support.integer_literals ) + +add_entrypoint_object( + dmull + SRCS + dmull.cpp + HDRS + ../dmull.h + DEPENDS + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + dmulf128 + SRCS + dmulf128.cpp + HDRS + ../dmulf128.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16mul + SRCS + f16mul.cpp + HDRS + ../f16mul.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16mulf + SRCS + f16mulf.cpp + HDRS + ../f16mulf.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16mull + SRCS + f16mull.cpp + HDRS + ../f16mull.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + f16mulf128 + SRCS + f16mulf128.cpp + HDRS + ../f16mulf128.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.generic.mul + COMPILE_OPTIONS + -O3 +) diff --git a/libc/src/math/generic/dmulf128.cpp b/libc/src/math/generic/dmulf128.cpp new file mode 100644 index 0000000000000..7e6ef95362c09 --- /dev/null +++ b/libc/src/math/generic/dmulf128.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of dmulf128 function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/dmulf128.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, dmulf128, (float128 x, float128 y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/dmull.cpp b/libc/src/math/generic/dmull.cpp new file mode 100644 index 0000000000000..428caa84a9977 --- /dev/null +++ b/libc/src/math/generic/dmull.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of dmull function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/dmull.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, dmull, (long double x, long double y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/f16mul.cpp b/libc/src/math/generic/f16mul.cpp new file mode 100644 index 0000000000000..f7a5225b60f11 --- /dev/null +++ b/libc/src/math/generic/f16mul.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of f16mul function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16mul.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float16, f16mul, (double x, double y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/f16mulf.cpp b/libc/src/math/generic/f16mulf.cpp new file mode 100644 index 0000000000000..2c04664f804ea --- /dev/null +++ b/libc/src/math/generic/f16mulf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of f16mulf function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16mulf.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float16, f16mulf, (float x, float y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/f16mulf128.cpp b/libc/src/math/generic/f16mulf128.cpp new file mode 100644 index 0000000000000..7e2d6a0d194ae --- /dev/null +++ b/libc/src/math/generic/f16mulf128.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of f16mulf128 function -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16mulf128.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float16, f16mulf128, (float128 x, float128 y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/f16mull.cpp b/libc/src/math/generic/f16mull.cpp new file mode 100644 index 0000000000000..fc66fba4d9f23 --- /dev/null +++ b/libc/src/math/generic/f16mull.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of f16mull function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/f16mull.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float16, f16mull, (long double x, long double y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp index 16fa11e93df09..64c27d6e2f956 100644 --- a/libc/src/math/generic/fmul.cpp +++ b/libc/src/math/generic/fmul.cpp @@ -1,4 +1,4 @@ -//===-- Implementation of fmul function------------------------------------===// +//===-- Implementation of fmul function -----------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -7,123 +7,14 @@ //===----------------------------------------------------------------------===// #include "src/math/fmul.h" -#include "src/__support/CPP/bit.h" -#include "src/__support/FPUtil/BasicOperations.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/FPUtil/generic/mul.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/__support/uint128.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) { - auto x_bits = fputil::FPBits(x); - - auto y_bits = fputil::FPBits(y); - - auto output_sign = (x_bits.sign() != y_bits.sign()) ? Sign::NEG : Sign::POS; - - if (LIBC_UNLIKELY(x_bits.is_inf_or_nan() || y_bits.is_inf_or_nan() || - x_bits.is_zero() || y_bits.is_zero())) { - if (x_bits.is_nan()) - return static_cast(x); - if (y_bits.is_nan()) - return static_cast(y); - if (x_bits.is_inf()) - return y_bits.is_zero() - ? fputil::FPBits::quiet_nan().get_val() - : fputil::FPBits::inf(output_sign).get_val(); - if (y_bits.is_inf()) - return x_bits.is_zero() - ? fputil::FPBits::quiet_nan().get_val() - : fputil::FPBits::inf(output_sign).get_val(); - // Now either x or y is zero, and the other one is finite. - return fputil::FPBits::zero(output_sign).get_val(); - } - - uint64_t mx, my; - - // Get mantissa and append the hidden bit if needed. - mx = x_bits.get_explicit_mantissa(); - my = y_bits.get_explicit_mantissa(); - - // Get the corresponding biased exponent. - int ex = x_bits.get_explicit_exponent(); - int ey = y_bits.get_explicit_exponent(); - - // Count the number of leading zeros of the explicit mantissas. - int nx = cpp::countl_zero(mx); - int ny = cpp::countl_zero(my); - // Shift the leading 1 bit to the most significant bit. - mx <<= nx; - my <<= ny; - - // Adjust exponent accordingly: If x or y are normal, we will only need to - // shift by (exponent length + sign bit = 11 bits. 
If x or y are denormal, we - // will need to shift more than 11 bits. - ex -= (nx - 11); - ey -= (ny - 11); - - UInt128 product = static_cast(mx) * static_cast(my); - int32_t dm1; - uint64_t highs, lows; - uint64_t g, hight, lowt; - uint32_t m; - uint32_t b; - int c; - - highs = static_cast(product >> 64); - c = static_cast(highs >= 0x8000000000000000); - lows = static_cast(product); - - lowt = (lows != 0); - - dm1 = ex + ey + c + fputil::FPBits::EXP_BIAS; - - int round_mode = fputil::quick_get_round(); - if (dm1 >= 255) { - if ((round_mode == FE_TOWARDZERO) || - (round_mode == FE_UPWARD && output_sign.is_neg()) || - (round_mode == FE_DOWNWARD && output_sign.is_pos())) { - return fputil::FPBits::max_normal(output_sign).get_val(); - } - return fputil::FPBits::inf().get_val(); - } else if (dm1 <= 0) { - - int m_shift = 40 + c - dm1; - int g_shift = m_shift - 1; - int h_shift = 64 - g_shift; - m = (m_shift >= 64) ? 0 : static_cast(highs >> m_shift); - - g = g_shift >= 64 ? 0 : (highs >> g_shift) & 1; - hight = h_shift >= 64 ? highs : (highs << h_shift) != 0; - - dm1 = 0; - } else { - m = static_cast(highs >> (39 + c)); - g = (highs >> (38 + c)) & 1; - hight = (highs << (26 - c)) != 0; - } - - if (round_mode == FE_TONEAREST) { - b = g && ((hight && lowt) || ((m & 1) != 0)); - } else if ((output_sign.is_neg() && round_mode == FE_DOWNWARD) || - (output_sign.is_pos() && round_mode == FE_UPWARD)) { - b = (g == 0 && (hight && lowt) == 0) ? 
0 : 1; - } else { - b = 0; - } - - uint32_t exp16 = (dm1 << 23); - - uint32_t m2 = m & fputil::FPBits::FRACTION_MASK; - - uint32_t result = (exp16 + m2) + b; - - auto result_bits = fputil::FPBits(result); - result_bits.set_sign(output_sign); - return result_bits.get_val(); + return fputil::generic::mul(x, y); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/fmulf128.cpp b/libc/src/math/generic/fmulf128.cpp new file mode 100644 index 0000000000000..c0c55ace641b8 --- /dev/null +++ b/libc/src/math/generic/fmulf128.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of fmulf128 function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmulf128.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float, fmulf128, (float128 x, float128 y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/fmull.cpp b/libc/src/math/generic/fmull.cpp new file mode 100644 index 0000000000000..41ab165e7d09d --- /dev/null +++ b/libc/src/math/generic/fmull.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of fmull function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmull.h" +#include "src/__support/FPUtil/generic/mul.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float, fmull, (long double x, long double y)) { + return fputil::generic::mul(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 64b4d2c58fb6a..c28385f620cfd 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1847,10 +1847,28 @@ add_fp_unittest( SRCS fmul_test.cpp HDRS - FMulTest.h + MulTest.h DEPENDS libc.src.math.fmul + libc.src.stdlib.rand + libc.src.stdlib.srand ) + +add_fp_unittest( + fmull_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + fmull_test.cpp + HDRS + MulTest.h + DEPENDS + libc.src.math.fmull + libc.src.stdlib.rand + libc.src.stdlib.srand +) + add_fp_unittest( asinhf_test NEED_MPFR @@ -2237,6 +2255,66 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + dmull_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + dmull_test.cpp + HDRS + MulTest.h + DEPENDS + libc.src.math.dmull + libc.src.stdlib.rand + libc.src.stdlib.srand +) + +add_fp_unittest( + f16mul_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16mul_test.cpp + HDRS + MulTest.h + DEPENDS + libc.src.math.f16mul + libc.src.stdlib.rand + libc.src.stdlib.srand +) + +add_fp_unittest( + f16mulf_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16mulf_test.cpp + HDRS + MulTest.h + DEPENDS + libc.src.math.f16mulf + libc.src.stdlib.rand + libc.src.stdlib.srand +) + +add_fp_unittest( + f16mull_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + f16mull_test.cpp + HDRS + MulTest.h + DEPENDS + libc.src.math.f16mull + libc.src.stdlib.rand + libc.src.stdlib.srand +) + 
add_subdirectory(generic) add_subdirectory(smoke) diff --git a/libc/test/src/math/FMulTest.h b/libc/test/src/math/FMulTest.h deleted file mode 100644 index 8ca33ea71b712..0000000000000 --- a/libc/test/src/math/FMulTest.h +++ /dev/null @@ -1,121 +0,0 @@ -//===-- Utility class to test fmul[f|l] -------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H -#define LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H - -#include "src/__support/FPUtil/FPBits.h" -#include "test/UnitTest/FEnvSafeTest.h" -#include "test/UnitTest/FPMatcher.h" -#include "test/UnitTest/Test.h" -#include "utils/MPFRWrapper/MPFRUtils.h" - -namespace mpfr = LIBC_NAMESPACE::testing::mpfr; - -template -class FmulMPFRTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { - - DECLARE_SPECIAL_CONSTANTS(InType) - -public: - typedef OutType (*FMulFunc)(InType, InType); - - void testFMulMPFR(FMulFunc func) { - constexpr int N = 10; - mpfr::BinaryInput INPUTS[N] = { - {3.0, 5.0}, - {0x1.0p1, 0x1.0p-131}, - {0x1.0p2, 0x1.0p-129}, - {1.0, 1.0}, - {-0.0, -0.0}, - {-0.0, 0.0}, - {0.0, -0.0}, - {0x1.0p100, 0x1.0p100}, - {1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150}, - {1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150}}; - - for (int i = 0; i < N; ++i) { - InType x = INPUTS[i].x; - InType y = INPUTS[i].y; - ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, INPUTS[i], - func(x, y), 0.5); - } - } - - void testSpecialInputsMPFR(FMulFunc func) { - constexpr int N = 27; - mpfr::BinaryInput INPUTS[N] = {{inf, 0x1.0p-129}, - {0x1.0p-129, inf}, - {inf, 2.0}, - {3.0, inf}, - {0.0, 0.0}, - {neg_inf, aNaN}, - {aNaN, neg_inf}, - {neg_inf, neg_inf}, - {0.0, neg_inf}, - {neg_inf, 0.0}, - {neg_inf, 1.0}, - {1.0, neg_inf}, - 
{neg_inf, 0x1.0p-129}, - {0x1.0p-129, neg_inf}, - {0.0, 0x1.0p-129}, - {inf, 0.0}, - {0.0, inf}, - {0.0, aNaN}, - {2.0, aNaN}, - {0x1.0p-129, aNaN}, - {inf, aNaN}, - {aNaN, aNaN}, - {0.0, sNaN}, - {2.0, sNaN}, - {0x1.0p-129, sNaN}, - {inf, sNaN}, - {sNaN, sNaN}}; - - for (int i = 0; i < N; ++i) { - InType x = INPUTS[i].x; - InType y = INPUTS[i].y; - ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, INPUTS[i], - func(x, y), 0.5); - } - } - - void testNormalRange(FMulFunc func) { - using FPBits = LIBC_NAMESPACE::fputil::FPBits; - using StorageType = typename FPBits::StorageType; - static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); - static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); - - constexpr StorageType COUNT = 10'001; - constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; - for (int signs = 0; signs < 4; ++signs) { - for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; - v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { - InType x = FPBits(v).get_val(), y = FPBits(w).get_val(); - if (signs % 2 == 1) { - x = -x; - } - if (signs >= 2) { - y = -y; - } - - mpfr::BinaryInput input{x, y}; - ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fmul, input, func(x, y), - 0.5); - } - } - } -}; - -#define LIST_FMUL_MPFR_TESTS(OutType, InType, func) \ - using LlvmLibcFmulTest = FmulMPFRTest; \ - TEST_F(LlvmLibcFmulTest, MulMpfr) { testFMulMPFR(&func); } \ - TEST_F(LlvmLibcFmulTest, NanInfMpfr) { testSpecialInputsMPFR(&func); } \ - TEST_F(LlvmLibcFmulTest, NormalRange) { testNormalRange(&func); } - -#endif // LLVM_LIBC_TEST_SRC_MATH_FMULTEST_H diff --git a/libc/test/src/math/MulTest.h b/libc/test/src/math/MulTest.h new file mode 100644 index 0000000000000..cb81a795be36b --- /dev/null +++ b/libc/test/src/math/MulTest.h @@ -0,0 +1,95 @@ +//===-- Utility class to test different flavors of float mul ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TEST_SRC_MATH_MULTEST_H +#define LLVM_LIBC_TEST_SRC_MATH_MULTEST_H + +#include "src/stdlib/rand.h" +#include "src/stdlib/srand.h" +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +template +class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { + + struct InConstants { + DECLARE_SPECIAL_CONSTANTS(InType) + }; + + using InFPBits = typename InConstants::FPBits; + using InStorageType = typename InConstants::StorageType; + + static constexpr InStorageType IN_MAX_NORMAL_U = + InFPBits::max_normal().uintval(); + static constexpr InStorageType IN_MIN_NORMAL_U = + InFPBits::min_normal().uintval(); + static constexpr InStorageType IN_MAX_SUBNORMAL_U = + InFPBits::max_subnormal().uintval(); + static constexpr InStorageType IN_MIN_SUBNORMAL_U = + InFPBits::min_subnormal().uintval(); + + InStorageType get_random_bit_pattern() { + InStorageType bits{0}; + for (InStorageType i = 0; i < sizeof(InStorageType) / 2; ++i) + bits = (bits << 2) + static_cast(LIBC_NAMESPACE::rand()); + return bits; + } + +public: + using MulFunc = OutType (*)(InType, InType); + + void test_subnormal_range(MulFunc func) { + constexpr InStorageType COUNT = 10'001; + constexpr InStorageType STEP = + (IN_MAX_SUBNORMAL_U - IN_MIN_SUBNORMAL_U) / COUNT; + LIBC_NAMESPACE::srand(1); + for (int signs = 0; signs < 4; signs++) { + for (InStorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { + InType x = InFPBits(get_random_bit_pattern()).get_val(); + InType y = InFPBits(v).get_val(); + if ((signs & 1) != 0) + x = -x; + if ((signs & 2) != 0) + y = -y; + mpfr::BinaryInput input{x, y}; + 
EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Mul, input, func(x, y), + 0.5); + } + } + } + + void test_normal_range(MulFunc func) { + constexpr InStorageType COUNT = 10'001; + constexpr InStorageType STEP = (IN_MAX_NORMAL_U - IN_MIN_NORMAL_U) / COUNT; + LIBC_NAMESPACE::srand(1); + for (int signs = 0; signs < 4; signs++) { + for (InStorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { + InType x = InFPBits(get_random_bit_pattern()).get_val(); + InType y = InFPBits(v).get_val(); + if ((signs & 1) != 0) + x = -x; + if ((signs & 2) != 0) + y = -y; + mpfr::BinaryInput input{x, y}; + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Mul, input, func(x, y), + 0.5); + } + } + } +}; + +#define LIST_MUL_TESTS(OutType, InType, func) \ + using LlvmLibcMulTest = MulTest; \ + TEST_F(LlvmLibcMulTest, SubnormalRange) { test_subnormal_range(&func); } \ + TEST_F(LlvmLibcMulTest, NormalRange) { test_normal_range(&func); } + +#endif // LLVM_LIBC_TEST_SRC_MATH_MULTEST_H diff --git a/libc/test/src/math/dmull_test.cpp b/libc/test/src/math/dmull_test.cpp new file mode 100644 index 0000000000000..1b9c9c2c24ed3 --- /dev/null +++ b/libc/test/src/math/dmull_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for dmull -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/dmull.h" + +LIST_MUL_TESTS(double, long double, LIBC_NAMESPACE::dmull) diff --git a/libc/test/src/math/f16mul_test.cpp b/libc/test/src/math/f16mul_test.cpp new file mode 100644 index 0000000000000..49b443870c483 --- /dev/null +++ b/libc/test/src/math/f16mul_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16mul ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/f16mul.h" + +LIST_MUL_TESTS(float16, double, LIBC_NAMESPACE::f16mul) diff --git a/libc/test/src/math/f16mulf_test.cpp b/libc/test/src/math/f16mulf_test.cpp new file mode 100644 index 0000000000000..bf2530863621d --- /dev/null +++ b/libc/test/src/math/f16mulf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16mulf ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/f16mulf.h" + +LIST_MUL_TESTS(float16, float, LIBC_NAMESPACE::f16mulf) diff --git a/libc/test/src/math/f16mull_test.cpp b/libc/test/src/math/f16mull_test.cpp new file mode 100644 index 0000000000000..5292ddb87b7f4 --- /dev/null +++ b/libc/test/src/math/f16mull_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16mull ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/f16mull.h" + +LIST_MUL_TESTS(float16, long double, LIBC_NAMESPACE::f16mull) diff --git a/libc/test/src/math/fmul_test.cpp b/libc/test/src/math/fmul_test.cpp index 16eaa1a818daf..3f6df66456bac 100644 --- a/libc/test/src/math/fmul_test.cpp +++ b/libc/test/src/math/fmul_test.cpp @@ -1,13 +1,13 @@ -//===-- Unittests for fmul-------------------------------------------------===// +//===-- Unittests for fmul ------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -//===---------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// -#include "FMulTest.h" +#include "MulTest.h" #include "src/math/fmul.h" -LIST_FMUL_MPFR_TESTS(float, double, LIBC_NAMESPACE::fmul) +LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul) diff --git a/libc/test/src/math/fmull_test.cpp b/libc/test/src/math/fmull_test.cpp new file mode 100644 index 0000000000000..ef694063f20dc --- /dev/null +++ b/libc/test/src/math/fmull_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmull -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/fmull.h" + +LIST_MUL_TESTS(float, long double, LIBC_NAMESPACE::fmull) diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 76d5919ad9156..c57aa9638ed30 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -2531,10 +2531,27 @@ add_fp_unittest( SRCS fmul_test.cpp HDRS - FMulTest.h + MulTest.h DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations libc.src.math.fmul - libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fmull_test + SUITE + libc-math-smoke-tests + SRCS + fmull_test.cpp + HDRS + MulTest.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.fmull ) add_fp_unittest( @@ -3981,3 +3998,78 @@ add_fp_unittest( DEPENDS libc.src.math.cbrt ) + +add_fp_unittest( + dmull_test + SUITE + libc-math-smoke-tests + SRCS + dmull_test.cpp + 
HDRS + MulTest.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.dmull +) + +add_fp_unittest( + f16mul_test + SUITE + libc-math-smoke-tests + SRCS + f16mul_test.cpp + HDRS + MulTest.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16mul +) + +add_fp_unittest( + f16mulf_test + SUITE + libc-math-smoke-tests + SRCS + f16mulf_test.cpp + HDRS + MulTest.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16mulf +) + +add_fp_unittest( + f16mull_test + SUITE + libc-math-smoke-tests + SRCS + f16mull_test.cpp + HDRS + MulTest.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16mull +) + +add_fp_unittest( + f16mulf128_test + SUITE + libc-math-smoke-tests + SRCS + f16mulf128_test.cpp + HDRS + MulTest.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.basic_operations + libc.src.math.f16mulf128 +) diff --git a/libc/test/src/math/smoke/FMulTest.h b/libc/test/src/math/smoke/FMulTest.h deleted file mode 100644 index 33fb82c8d2da1..0000000000000 --- a/libc/test/src/math/smoke/FMulTest.h +++ /dev/null @@ -1,104 +0,0 @@ -//===-- Utility class to test fmul[f|l] ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H -#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H - -#include "test/UnitTest/FEnvSafeTest.h" -#include "test/UnitTest/FPMatcher.h" -#include "test/UnitTest/Test.h" - -template -class FmulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { - - DECLARE_SPECIAL_CONSTANTS(T) - -public: - typedef T (*FMulFunc)(R, R); - - void testMul(FMulFunc func) { - - EXPECT_FP_EQ_ALL_ROUNDING(T(15.0), func(3.0, 5.0)); - EXPECT_FP_EQ_ALL_ROUNDING(T(0x1.0p-130), func(0x1.0p1, 0x1.0p-131)); - EXPECT_FP_EQ_ALL_ROUNDING(T(0x1.0p-127), func(0x1.0p2, 0x1.0p-129)); - EXPECT_FP_EQ_ALL_ROUNDING(T(1.0), func(1.0, 1.0)); - - EXPECT_FP_EQ_ALL_ROUNDING(T(0.0), func(-0.0, -0.0)); - EXPECT_FP_EQ_ALL_ROUNDING(T(-0.0), func(0.0, -0.0)); - EXPECT_FP_EQ_ALL_ROUNDING(T(-0.0), func(-0.0, 0.0)); - - EXPECT_FP_EQ_ROUNDING_NEAREST(inf, func(0x1.0p100, 0x1.0p100)); - EXPECT_FP_EQ_ROUNDING_UPWARD(inf, func(0x1.0p100, 0x1.0p100)); - EXPECT_FP_EQ_ROUNDING_DOWNWARD(max_normal, func(0x1.0p100, 0x1.0p100)); - EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(max_normal, func(0x1.0p100, 0x1.0p100)); - - EXPECT_FP_EQ_ROUNDING_NEAREST( - 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - EXPECT_FP_EQ_ROUNDING_DOWNWARD( - 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO( - 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - EXPECT_FP_EQ_ROUNDING_UPWARD( - 0x1p0, func(1.0, 1.0 + 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - - EXPECT_FP_EQ_ROUNDING_NEAREST( - 0x1.0p-128f + 0x1.0p-148f, - func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - EXPECT_FP_EQ_ROUNDING_UPWARD( - 0x1.0p-128f + 0x1.0p-148f, - func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - EXPECT_FP_EQ_ROUNDING_DOWNWARD( - 0x1.0p-128f + 0x1.0p-149f, - func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - 
EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO( - 0x1.0p-128f + 0x1.0p-149f, - func(1.0, 0x1.0p-128 + 0x1.0p-149 + 0x1.0p-150)); - } - - void testSpecialInputs(FMulFunc func) { - EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, 0x1.0p-129)); - EXPECT_FP_EQ_ALL_ROUNDING(inf, func(0x1.0p-129, inf)); - EXPECT_FP_EQ_ALL_ROUNDING(inf, func(inf, 2.0)); - EXPECT_FP_EQ_ALL_ROUNDING(inf, func(3.0, inf)); - EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, 0.0)); - - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(neg_inf, aNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, neg_inf)); - EXPECT_FP_EQ_ALL_ROUNDING(inf, func(neg_inf, neg_inf)); - - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, neg_inf)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(neg_inf, 0.0)); - - EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, 1.0)); - EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(1.0, neg_inf)); - - EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(neg_inf, 0x1.0p-129)); - EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, func(0x1.0p-129, neg_inf)); - - EXPECT_FP_EQ_ALL_ROUNDING(0.0, func(0.0, 0x1.0p-129)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, 0.0)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, inf)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, aNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(2.0, aNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0x1.0p-129, aNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, aNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(aNaN, aNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0.0, sNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(2.0, sNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(0x1.0p-129, sNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(inf, sNaN)); - EXPECT_FP_EQ_ALL_ROUNDING(aNaN, func(sNaN, sNaN)); - } -}; - -#define LIST_FMUL_TESTS(T, R, func) \ - using LlvmLibcFmulTest = FmulTest; \ - TEST_F(LlvmLibcFmulTest, Mul) { testMul(&func); } \ - TEST_F(LlvmLibcFmulTest, NaNInf) { testSpecialInputs(&func); } - -#endif // LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMULTEST_H diff --git a/libc/test/src/math/smoke/MulTest.h 
b/libc/test/src/math/smoke/MulTest.h new file mode 100644 index 0000000000000..e2298eaeeb216 --- /dev/null +++ b/libc/test/src/math/smoke/MulTest.h @@ -0,0 +1,171 @@ +//===-- Utility class to test different flavors of float mul ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_MULTEST_H +#define LLVM_LIBC_TEST_SRC_MATH_SMOKE_MULTEST_H + +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "test/UnitTest/FEnvSafeTest.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +template +class MulTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { + + DECLARE_SPECIAL_CONSTANTS(OutType) + + struct InConstants { + DECLARE_SPECIAL_CONSTANTS(InType) + }; + + using InFPBits = typename InConstants::FPBits; + using InStorageType = typename InConstants::StorageType; + + InConstants in; + +public: + using MulFunc = OutType (*)(InType, InType); + + void test_special_numbers(MulFunc func) { + EXPECT_FP_IS_NAN(func(aNaN, aNaN)); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(sNaN, sNaN), FE_INVALID); + + InType qnan_42 = InFPBits::quiet_nan(Sign::POS, 0x42).get_val(); + EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, zero))); + EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_42))); + + if constexpr (sizeof(OutType) < sizeof(InType)) { + InStorageType max_payload = InFPBits::FRACTION_MASK >> 1; + InType qnan_max = InFPBits::quiet_nan(Sign::POS, max_payload).get_val(); + EXPECT_FP_EQ(zero, + LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, zero))); + EXPECT_FP_EQ(zero, + LIBC_NAMESPACE::fputil::getpayload(func(zero, qnan_max))); + 
EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(qnan_max, qnan_42))); + EXPECT_FP_EQ(InType(0x42.0p+0), + LIBC_NAMESPACE::fputil::getpayload(func(qnan_42, qnan_max))); + } + + EXPECT_FP_EQ(inf, func(inf, InType(1.0))); + EXPECT_FP_EQ(neg_inf, func(neg_inf, InType(1.0))); + EXPECT_FP_EQ(neg_inf, func(inf, InType(-1.0))); + EXPECT_FP_EQ(inf, func(neg_inf, InType(-1.0))); + + EXPECT_FP_EQ_ALL_ROUNDING(zero, func(zero, zero)); + EXPECT_FP_EQ_ALL_ROUNDING(zero, func(neg_zero, neg_zero)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(zero, neg_zero)); + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, func(neg_zero, zero)); + + EXPECT_FP_EQ_ALL_ROUNDING(OutType(1.0), func(1.0, 1.0)); + EXPECT_FP_EQ_ALL_ROUNDING(OutType(15.0), func(3.0, 5.0)); + EXPECT_FP_EQ_ALL_ROUNDING(OutType(0x1.0p-13), func(0x1.0p+1, 0x1.0p-14)); + EXPECT_FP_EQ_ALL_ROUNDING(OutType(0x1.0p-10), func(0x1.0p+2, 0x1.0p-12)); + } + + void test_invalid_operations(MulFunc func) { + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(inf, neg_zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, zero), FE_INVALID); + EXPECT_FP_IS_NAN_WITH_EXCEPTION(func(neg_inf, neg_zero), FE_INVALID); + } + + void test_range_errors(MulFunc func) { + using namespace LIBC_NAMESPACE::fputil::testing; + + if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, + func(in.neg_min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + + if (ForceRoundingMode r(RoundingMode::TowardZero); 
r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, + func(neg_max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, + func(in.neg_min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + + if (ForceRoundingMode r(RoundingMode::Downward); r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(max_normal, func(max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, func(neg_max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(zero, func(in.min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_min_denormal, + func(in.neg_min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + + if (ForceRoundingMode r(RoundingMode::Upward); r.success) { + EXPECT_FP_EQ_WITH_EXCEPTION(inf, func(max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_max_normal, + func(neg_max_normal, max_normal), + FE_OVERFLOW | FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION(min_denormal, + func(in.min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + EXPECT_FP_EQ_WITH_EXCEPTION(neg_zero, + func(in.neg_min_denormal, in.min_denormal), + FE_UNDERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + } + } + + void test_inexact_results(MulFunc func) { + InFPBits x_bits = InFPBits::one(); + x_bits.set_mantissa(InFPBits::SIG_MASK); + InType x = x_bits.get_val(); + func(x, x); + EXPECT_FP_EXCEPTION(FE_INEXACT); + } +}; + +#define LIST_MUL_TESTS(OutType, InType, func) \ + using LlvmLibcMulTest = 
MulTest; \ + TEST_F(LlvmLibcMulTest, SpecialNumbers) { test_special_numbers(&func); } \ + TEST_F(LlvmLibcMulTest, InvalidOperations) { \ + test_invalid_operations(&func); \ + } \ + TEST_F(LlvmLibcMulTest, RangeErrors) { test_range_errors(&func); } \ + TEST_F(LlvmLibcMulTest, InexactResults) { test_inexact_results(&func); } + +#endif // LLVM_LIBC_TEST_SRC_MATH_SMOKE_MULTEST_H diff --git a/libc/test/src/math/smoke/dmulf128_test.cpp b/libc/test/src/math/smoke/dmulf128_test.cpp new file mode 100644 index 0000000000000..2ee2d10b9a19b --- /dev/null +++ b/libc/test/src/math/smoke/dmulf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for dmulf128 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/dmulf128.h" + +LIST_MUL_TESTS(double, float128, LIBC_NAMESPACE::dmulf128) diff --git a/libc/test/src/math/smoke/dmull_test.cpp b/libc/test/src/math/smoke/dmull_test.cpp new file mode 100644 index 0000000000000..1b9c9c2c24ed3 --- /dev/null +++ b/libc/test/src/math/smoke/dmull_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for dmull -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/dmull.h" + +LIST_MUL_TESTS(double, long double, LIBC_NAMESPACE::dmull) diff --git a/libc/test/src/math/smoke/f16mul_test.cpp b/libc/test/src/math/smoke/f16mul_test.cpp new file mode 100644 index 0000000000000..49b443870c483 --- /dev/null +++ b/libc/test/src/math/smoke/f16mul_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16mul ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/f16mul.h" + +LIST_MUL_TESTS(float16, double, LIBC_NAMESPACE::f16mul) diff --git a/libc/test/src/math/smoke/f16mulf128_test.cpp b/libc/test/src/math/smoke/f16mulf128_test.cpp new file mode 100644 index 0000000000000..46e52cf068a7b --- /dev/null +++ b/libc/test/src/math/smoke/f16mulf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16mulf128 ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/f16mulf128.h" + +LIST_MUL_TESTS(float16, float128, LIBC_NAMESPACE::f16mulf128) diff --git a/libc/test/src/math/smoke/f16mulf_test.cpp b/libc/test/src/math/smoke/f16mulf_test.cpp new file mode 100644 index 0000000000000..bf2530863621d --- /dev/null +++ b/libc/test/src/math/smoke/f16mulf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16mulf ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/f16mulf.h" + +LIST_MUL_TESTS(float16, float, LIBC_NAMESPACE::f16mulf) diff --git a/libc/test/src/math/smoke/f16mull_test.cpp b/libc/test/src/math/smoke/f16mull_test.cpp new file mode 100644 index 0000000000000..5292ddb87b7f4 --- /dev/null +++ b/libc/test/src/math/smoke/f16mull_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for f16mull ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/f16mull.h" + +LIST_MUL_TESTS(float16, long double, LIBC_NAMESPACE::f16mull) diff --git a/libc/test/src/math/smoke/fmul_test.cpp b/libc/test/src/math/smoke/fmul_test.cpp index 0eb664f7411ee..3f6df66456bac 100644 --- a/libc/test/src/math/smoke/fmul_test.cpp +++ b/libc/test/src/math/smoke/fmul_test.cpp @@ -1,13 +1,13 @@ -//===-- Unittests for fmul-------------------------------------------------===// +//===-- Unittests for fmul ------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -//===---------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// -#include "FMulTest.h" +#include "MulTest.h" #include "src/math/fmul.h" -LIST_FMUL_TESTS(float, double, LIBC_NAMESPACE::fmul) +LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul) diff --git a/libc/test/src/math/smoke/fmulf128_test.cpp b/libc/test/src/math/smoke/fmulf128_test.cpp new file mode 100644 index 0000000000000..37c8d1cf9908d --- /dev/null +++ b/libc/test/src/math/smoke/fmulf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmulf128 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/fmulf128.h" + +LIST_MUL_TESTS(float, float128, LIBC_NAMESPACE::fmulf128) diff --git a/libc/test/src/math/smoke/fmull_test.cpp b/libc/test/src/math/smoke/fmull_test.cpp new file mode 100644 index 0000000000000..ef694063f20dc --- /dev/null +++ b/libc/test/src/math/smoke/fmull_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmull -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MulTest.h" + +#include "src/math/fmull.h" + +LIST_MUL_TESTS(float, long double, LIBC_NAMESPACE::fmull) diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index b67a9da40bd7b..bb63e0b9f4de0 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -539,7 +539,7 @@ class MPFRNumber { return result; } - MPFRNumber fmul(const MPFRNumber &b) { + MPFRNumber mul(const MPFRNumber &b) { MPFRNumber result(*this); mpfr_mul(result.value, value, b.value, mpfr_rounding); return result; @@ -800,12 +800,12 @@ binary_operation_one_output(Operation op, InputType x, InputType y, return inputX.fmod(inputY); case Operation::Hypot: return inputX.hypot(inputY); + case Operation::Mul: + return inputX.mul(inputY); case Operation::Pow: return inputX.pow(inputY); case Operation::Sub: return inputX.sub(inputY); - case Operation::Fmul: - return inputX.fmul(inputY); default: __builtin_unreachable(); } @@ -1013,15 +1013,18 @@ void explain_binary_operation_one_output_error( template void explain_binary_operation_one_output_error(Operation, const BinaryInput 
&, float, double, RoundingMode); +template void explain_binary_operation_one_output_error( + Operation, const BinaryInput &, float, double, RoundingMode); template void explain_binary_operation_one_output_error( Operation, const BinaryInput &, double, double, RoundingMode); +template void explain_binary_operation_one_output_error( + Operation, const BinaryInput &, float, double, RoundingMode); +template void explain_binary_operation_one_output_error( + Operation, const BinaryInput &, double, double, RoundingMode); template void explain_binary_operation_one_output_error(Operation, const BinaryInput &, long double, double, RoundingMode); - -template void explain_binary_operation_one_output_error( - Operation, const BinaryInput &, float, double, RoundingMode); #ifdef LIBC_TYPES_HAS_FLOAT16 template void explain_binary_operation_one_output_error( Operation, const BinaryInput &, float16, double, RoundingMode); @@ -1195,6 +1198,12 @@ template bool compare_binary_operation_one_output(Operation, const BinaryInput &, double, double, RoundingMode); template bool +compare_binary_operation_one_output(Operation, const BinaryInput &, + float, double, RoundingMode); +template bool +compare_binary_operation_one_output(Operation, const BinaryInput &, + double, double, RoundingMode); +template bool compare_binary_operation_one_output(Operation, const BinaryInput &, long double, double, RoundingMode); diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 28390af9ee6d8..8d51fa4e47726 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -79,9 +79,9 @@ enum class Operation : int { Div, Fmod, Hypot, + Mul, Pow, Sub, - Fmul, EndBinaryOperationsSingleOutput, // Operations which take two floating point numbers of the same type as From b37bdadbe784339e455915368a4893d3bd4a1193 Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Thu, 18 Jul 2024 10:56:26 -0700 Subject: 
[PATCH 059/486] [libc] newheadergen: adding h_def_file arg to test (#99397) - spacing with _NOEXCEPT --- .../class_implementation/classes/function.py | 2 +- libc/newhdrgen/header.py | 8 +-- libc/newhdrgen/tests/output/test_small.h | 52 ------------------- libc/newhdrgen/tests/test_integration.py | 1 + 4 files changed, 6 insertions(+), 57 deletions(-) delete mode 100644 libc/newhdrgen/tests/output/test_small.h diff --git a/libc/newhdrgen/class_implementation/classes/function.py b/libc/newhdrgen/class_implementation/classes/function.py index ccfd93547c1d8..845ef1aebf54b 100644 --- a/libc/newhdrgen/class_implementation/classes/function.py +++ b/libc/newhdrgen/class_implementation/classes/function.py @@ -26,7 +26,7 @@ def __str__(self): attributes_str = " ".join(self.attributes) arguments_str = ", ".join(self.arguments) if attributes_str == "": - result = f"{self.return_type} {self.name}({arguments_str});" + result = f"{self.return_type} {self.name}({arguments_str})" else: result = f"{attributes_str} {self.return_type} {self.name}({arguments_str})" return result diff --git a/libc/newhdrgen/header.py b/libc/newhdrgen/header.py index 69de81eebb719..141e3c9b2736b 100644 --- a/libc/newhdrgen/header.py +++ b/libc/newhdrgen/header.py @@ -60,16 +60,16 @@ def __str__(self): current_guard = None for function in self.functions: if function.guard == None: - content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + " __NOEXCEPT;") content.append("") else: if current_guard == None: current_guard = function.guard content.append(f"#ifdef {current_guard}") - content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + " __NOEXCEPT;") content.append("") elif current_guard == function.guard: - content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + " __NOEXCEPT;") content.append("") else: content.pop() @@ -77,7 +77,7 @@ def __str__(self): content.append("") current_guard = function.guard content.append(f"#ifdef {current_guard}") 
- content.append(str(function) + "__NOEXCEPT") + content.append(str(function) + " __NOEXCEPT;") content.append("") if current_guard != None: content.pop() diff --git a/libc/newhdrgen/tests/output/test_small.h b/libc/newhdrgen/tests/output/test_small.h deleted file mode 100644 index a777976134b04..0000000000000 --- a/libc/newhdrgen/tests/output/test_small.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- C standard library header test_small-------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_TEST_SMALL_H -#define LLVM_LIBC_TEST_SMALL_H - -#include "__llvm-libc-common.h" -#include "llvm-libc-macros/float16-macros.h" -#include "llvm-libc-macros/test_small-macros.h" -#include "llvm-libc-types/float128.h" - -#define MACRO_A 1 - -#define MACRO_B 2 - -#include -#include - -enum { - enum_a = value_1, - enum_b = value_2, -}; - -__BEGIN_C_DECLS - -CONST_FUNC_A void func_a() __NOEXCEPT; - -#ifdef LIBC_TYPES_HAS_FLOAT128 -float128 func_b() __NOEXCEPT; -#endif // LIBC_TYPES_HAS_FLOAT128 - -#ifdef LIBC_TYPES_HAS_FLOAT16 -_Float16 func_c(int, float) __NOEXCEPT; - -_Float16 func_d(int, float) __NOEXCEPT; -#endif // LIBC_TYPES_HAS_FLOAT16 - -#ifdef LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128 -_Float16 func_e(float128) __NOEXCEPT; -#endif // LIBC_TYPES_HAS_FLOAT16_AND_FLOAT128 - -extern obj object_1; -extern obj object_2; - -__END_C_DECLS - -#endif // LLVM_LIBC_TEST_SMALL_H diff --git a/libc/newhdrgen/tests/test_integration.py b/libc/newhdrgen/tests/test_integration.py index f12d18bc04a49..628a37b11c309 100644 --- a/libc/newhdrgen/tests/test_integration.py +++ b/libc/newhdrgen/tests/test_integration.py @@ -25,6 +25,7 @@ def run_script(self, yaml_file, h_def_file, output_dir): "python3", 
str(self.source_dir / "libc/newhdrgen/yaml_to_classes.py"), str(yaml_file), + "--h_def_file", str(h_def_file), "--output_dir", str(output_dir), From 78e3bfc120c8a23e246f544a5e9fb122828a21a7 Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Thu, 18 Jul 2024 20:04:21 +0200 Subject: [PATCH 060/486] [LLDB][test] Drop OS/HOST_OS detection code from Makefile.rules (#99535) Remove commands for OS/HOST_OS detection from Makefile.rules to simplify it, since logic for these variables has been implemented in `lldb/packages/Python/lldbsuite/test/lldbplatformutil.py` (7021e44b2f0e11717c0d82456bad0fed4a0b48f9). --- .../Python/lldbsuite/test/make/Makefile.rules | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 3d562285ce9cc..597aa94566c24 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -46,31 +46,6 @@ LLDB_BASE_DIR := $(THIS_FILE_DIR)/../../../../../ # according to variable values). .DEFAULT_GOAL := all -#---------------------------------------------------------------------- -# If OS is not defined, use 'uname -s' to determine the OS name. -# -# GNUWin32 uname gives "windows32" or "server version windows32" while -# some versions of MSYS uname return "MSYS_NT*", but most environments -# standardize on "Windows_NT", so we'll make it consistent here. -# When running tests from Visual Studio, the environment variable isn't -# inherited all the way down to the process spawned for make. 
-#---------------------------------------------------------------------- -ifeq "$(HOST_OS)" "" - HOST_OS := $(shell uname -s) -endif - -ifneq (,$(findstring windows32,$(HOST_OS))) - HOST_OS := Windows_NT -endif - -ifneq (,$(findstring MSYS_NT,$(HOST_OS))) - HOST_OS := Windows_NT -endif - -ifeq "$(OS)" "" - OS := $(HOST_OS) -endif - #---------------------------------------------------------------------- # If OS is Windows, force SHELL to be cmd # From c5432d31cb339262451215f6cf9c356a514a1770 Mon Sep 17 00:00:00 2001 From: vporpo Date: Thu, 18 Jul 2024 11:06:33 -0700 Subject: [PATCH 061/486] [SandboxIR][Tracker] Track eraseFromParent() (#99431) This patch adds tracking support for Instruction::eraseFromParent(). The Instruction is not actually being erased, but instead it is detached from the instruction list and drops its Use edges. The original instruction position and Use edges are saved in the `EraseFromParent` change object, and are being used during `revert()` to restore the original state. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 4 +- llvm/include/llvm/SandboxIR/Tracker.h | 40 +++++++++++++++- llvm/lib/SandboxIR/SandboxIR.cpp | 25 ++++++++-- llvm/lib/SandboxIR/Tracker.cpp | 59 ++++++++++++++++++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 52 +++++++++++++++++++++ 5 files changed, 173 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index c5d59ba47ca31..a9f0177eb9338 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -493,6 +493,7 @@ class Instruction : public sandboxir::User { /// \Returns the LLVM IR Instructions that this SandboxIR maps to in program /// order. virtual SmallVector getLLVMInstrs() const = 0; + friend class EraseFromParent; // For getLLVMInstrs(). public: static const char *getOpcodeName(Opcode Opc); @@ -658,6 +659,7 @@ class Context { friend void Instruction::eraseFromParent(); // For detach(). 
/// Take ownership of VPtr and store it in `LLVMValueToValueMap`. Value *registerValue(std::unique_ptr &&VPtr); + friend class EraseFromParent; // For registerValue(). /// This is the actual function that creates sandboxir values for \p V, /// and among others handles all instruction types. Value *getOrCreateValueInternal(llvm::Value *V, llvm::User *U = nullptr); @@ -682,7 +684,7 @@ class Context { friend class BasicBlock; // For getOrCreateValue(). public: - Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx) {} + Context(LLVMContext &LLVMCtx) : LLVMCtx(LLVMCtx), IRTracker(*this) {} Tracker &getTracker() { return IRTracker; } /// Convenience function for `getTracker().save()` diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index 2d0904f5665b1..a91b9f178b8aa 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -40,6 +40,7 @@ #ifndef LLVM_SANDBOXIR_TRACKER_H #define LLVM_SANDBOXIR_TRACKER_H +#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" @@ -99,6 +100,41 @@ class UseSet : public IRChangeBase { #endif }; +class EraseFromParent : public IRChangeBase { + /// Contains all the data we need to restore an "erased" (i.e., detached) + /// instruction: the instruction itself and its operands in order. + struct InstrAndOperands { + /// The operands that got dropped. + SmallVector Operands; + /// The instruction that got "erased". + llvm::Instruction *LLVMI; + }; + /// The instruction data is in reverse program order, which helps create the + /// original program order during revert(). + SmallVector InstrData; + /// This is either the next Instruction in the stream, or the parent + /// BasicBlock if at the end of the BB. + PointerUnion NextLLVMIOrBB; + /// We take ownership of the "erased" instruction. 
+ std::unique_ptr ErasedIPtr; + +public: + EraseFromParent(std::unique_ptr &&IPtr, Tracker &Tracker); + void revert() final; + void accept() final; +#ifndef NDEBUG + void dump(raw_ostream &OS) const final { + dumpCommon(OS); + OS << "EraseFromParent"; + } + LLVM_DUMP_METHOD void dump() const final; + friend raw_ostream &operator<<(raw_ostream &OS, const EraseFromParent &C) { + C.dump(OS); + return OS; + } +#endif +}; + /// The tracker collects all the change objects and implements the main API for /// saving / reverting / accepting. class Tracker { @@ -116,6 +152,7 @@ class Tracker { #endif /// The current state of the tracker. TrackerState State = TrackerState::Disabled; + Context &Ctx; public: #ifndef NDEBUG @@ -124,8 +161,9 @@ class Tracker { bool InMiddleOfCreatingChange = false; #endif // NDEBUG - Tracker() = default; + explicit Tracker(Context &Ctx) : Ctx(Ctx) {} ~Tracker(); + Context &getContext() const { return Ctx; } /// Record \p Change and take ownership. This is the main function used to /// track Sandbox IR changes. void track(std::unique_ptr &&Change); diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 944869a37989c..c6daf1a586546 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -341,11 +341,26 @@ void Instruction::removeFromParent() { void Instruction::eraseFromParent() { assert(users().empty() && "Still connected to users, can't erase!"); std::unique_ptr Detached = Ctx.detach(this); - // We don't have Tracking yet, so just erase the LLVM IR instructions. - // Erase in reverse to avoid erasing nstructions with attached uses. 
- auto Instrs = getLLVMInstrs(); - for (llvm::Instruction *I : reverse(Instrs)) - I->eraseFromParent(); + auto LLVMInstrs = getLLVMInstrs(); + + auto &Tracker = Ctx.getTracker(); + if (Tracker.isTracking()) { + Tracker.track( + std::make_unique(std::move(Detached), Tracker)); + // We don't actually delete the IR instruction, because then it would be + // impossible to bring it back from the dead at the same memory location. + // Instead we remove it from its BB and track its current location. + for (llvm::Instruction *I : LLVMInstrs) + I->removeFromParent(); + // TODO: Multi-instructions need special treatment because some of the + // references are internal to the instruction. + for (llvm::Instruction *I : LLVMInstrs) + I->dropAllReferences(); + } else { + // Erase in reverse to avoid erasing nstructions with attached uses. + for (llvm::Instruction *I : reverse(LLVMInstrs)) + I->eraseFromParent(); + } } void Instruction::moveBefore(BasicBlock &BB, const BBIterator &WhereIt) { diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index 1182f5c55d10b..2336a0067abbd 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -41,6 +41,65 @@ Tracker::~Tracker() { assert(Changes.empty() && "You must accept or revert changes!"); } +EraseFromParent::EraseFromParent(std::unique_ptr &&ErasedIPtr, + Tracker &Tracker) + : IRChangeBase(Tracker), ErasedIPtr(std::move(ErasedIPtr)) { + auto *I = cast(this->ErasedIPtr.get()); + auto LLVMInstrs = I->getLLVMInstrs(); + // Iterate in reverse program order. 
+ for (auto *LLVMI : reverse(LLVMInstrs)) { + SmallVector Operands; + Operands.reserve(LLVMI->getNumOperands()); + for (auto [OpNum, Use] : enumerate(LLVMI->operands())) + Operands.push_back(Use.get()); + InstrData.push_back({Operands, LLVMI}); + } + assert(is_sorted(InstrData, + [](const auto &D0, const auto &D1) { + return D0.LLVMI->comesBefore(D1.LLVMI); + }) && + "Expected reverse program order!"); + auto *BotLLVMI = cast(I->Val); + if (BotLLVMI->getNextNode() != nullptr) + NextLLVMIOrBB = BotLLVMI->getNextNode(); + else + NextLLVMIOrBB = BotLLVMI->getParent(); +} + +void EraseFromParent::accept() { + for (const auto &IData : InstrData) + IData.LLVMI->deleteValue(); +} + +void EraseFromParent::revert() { + // Place the bottom-most instruction first. + auto [Operands, BotLLVMI] = InstrData[0]; + if (auto *NextLLVMI = NextLLVMIOrBB.dyn_cast()) { + BotLLVMI->insertBefore(NextLLVMI); + } else { + auto *LLVMBB = NextLLVMIOrBB.get(); + BotLLVMI->insertInto(LLVMBB, LLVMBB->end()); + } + for (auto [OpNum, Op] : enumerate(Operands)) + BotLLVMI->setOperand(OpNum, Op); + + // Go over the rest of the instructions and stack them on top. 
+ for (auto [Operands, LLVMI] : drop_begin(InstrData)) { + LLVMI->insertBefore(BotLLVMI); + for (auto [OpNum, Op] : enumerate(Operands)) + LLVMI->setOperand(OpNum, Op); + BotLLVMI = LLVMI; + } + Parent.getContext().registerValue(std::move(ErasedIPtr)); +} + +#ifndef NDEBUG +void EraseFromParent::dump() const { + dump(dbgs()); + dbgs() << "\n"; +} +#endif + void Tracker::track(std::unique_ptr &&Change) { assert(State == TrackerState::Record && "The tracker should be tracking!"); Changes.push_back(std::move(Change)); diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index f090dc521c32b..4e5dccf3a231f 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -146,3 +146,55 @@ define void @foo(ptr %ptr) { Ctx.accept(); EXPECT_EQ(St0->getOperand(0), Ld1); } + +// TODO: Test multi-instruction patterns. +TEST_F(TrackerTest, EraseFromParent) { + parseIR(C, R"IR( +define void @foo(i32 %v1) { + %add0 = add i32 %v1, %v1 + %add1 = add i32 %add0, %v1 + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto *F = Ctx.createFunction(&LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + sandboxir::Instruction *Add0 = &*It++; + sandboxir::Instruction *Add1 = &*It++; + sandboxir::Instruction *Ret = &*It++; + + Ctx.save(); + // Check erase. + Add1->eraseFromParent(); + It = BB->begin(); + EXPECT_EQ(&*It++, Add0); + EXPECT_EQ(&*It++, Ret); + EXPECT_EQ(It, BB->end()); + EXPECT_EQ(Add0->getNumUses(), 0u); + + // Check revert(). + Ctx.revert(); + It = BB->begin(); + EXPECT_EQ(&*It++, Add0); + EXPECT_EQ(&*It++, Add1); + EXPECT_EQ(&*It++, Ret); + EXPECT_EQ(It, BB->end()); + EXPECT_EQ(Add1->getOperand(0), Add0); + + // Same for the last instruction in the block. 
+ Ctx.save(); + Ret->eraseFromParent(); + It = BB->begin(); + EXPECT_EQ(&*It++, Add0); + EXPECT_EQ(&*It++, Add1); + EXPECT_EQ(It, BB->end()); + Ctx.revert(); + It = BB->begin(); + EXPECT_EQ(&*It++, Add0); + EXPECT_EQ(&*It++, Add1); + EXPECT_EQ(&*It++, Ret); + EXPECT_EQ(It, BB->end()); +} From 507c18b445ef88d985d95181db8107f669aed998 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Thu, 18 Jul 2024 18:19:11 +0000 Subject: [PATCH 062/486] [clang-tidy] Few tiny fixes after #99084 Update documentation, and correct configuration --- .../bugprone/UnusedReturnValueCheck.cpp | 12 +++--- .../checks/bugprone/unused-return-value.rst | 40 ++++++++++--------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp index 955a9b94dfaf6..1da5b222c2e8f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.cpp @@ -138,13 +138,13 @@ UnusedReturnValueCheck::UnusedReturnValueCheck(llvm::StringRef Name, "^::sigismember$;" "^::strcasecmp$;" "^::strsignal$;" - "^::ttyname"))), + "^::ttyname$"))), CheckedReturnTypes(utils::options::parseStringList( - Options.get("CheckedReturnTypes", "::std::error_code$;" - "::std::error_condition$;" - "::std::errc$;" - "::std::expected$;" - "::boost::system::error_code"))), + Options.get("CheckedReturnTypes", "^::std::error_code$;" + "^::std::error_condition$;" + "^::std::errc$;" + "^::std::expected$;" + "^::boost::system::error_code$"))), AllowCastToVoid(Options.get("AllowCastToVoid", false)) {} UnusedReturnValueCheck::UnusedReturnValueCheck( diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst index 9205ba98729c4..10ae0fe3243a0 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst +++ 
b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-return-value.rst @@ -16,23 +16,26 @@ Options This parameter supports regexp. The function is checked if the name and scope matches, with any arguments. By default the following functions are checked: - ``::std::async$, ::std::launder$, ::std::remove$, ::std::remove_if$, ::std::unique$, - ::std::unique_ptr::release$, ::std::basic_string::empty$, ::std::vector::empty$, - ::std::back_inserter$, ::std::distance$, ::std::find$, ::std::find_if$, ::std::inserter$, - ::std::lower_bound$, ::std::make_pair$, ::std::map::count$, ::std::map::find$, - ::std::map::lower_bound$, ::std::multimap::equal_range$, ::std::multimap::upper_bound$, - ::std::set::count$, ::std::set::find$, ::std::setfill$, ::std::setprecision$, - ::std::setw$, ::std::upper_bound$, ::std::vector::at$, ::bsearch$, ::ferror$, - ::feof$, ::isalnum$, ::isalpha$, ::isblank$, ::iscntrl$, ::isdigit$, ::isgraph$, - ::islower$, ::isprint$, ::ispunct$, ::isspace$, ::isupper$, ::iswalnum$, ::iswprint$, - ::iswspace$, ::isxdigit$, ::memchr$, ::memcmp$, ::strcmp$, ::strcoll$, ::strncmp$, - ::strpbrk$, ::strrchr$, ::strspn$, ::strstr$, ::wcscmp$, ::access$, ::bind$, - ::connect$, ::difftime$, ::dlsym$, ::fnmatch$, ::getaddrinfo$, ::getopt$, - ::htonl$, ::htons$, ::iconv_open$, ::inet_addr$, isascii$, isatty$, ::mmap$, - ::newlocale$, ::openat$, ::pathconf$, ::pthread_equal$, ::pthread_getspecific$, - ::pthread_mutex_trylock$, ::readdir$, ::readlink$, ::recvmsg$, ::regexec$, ::scandir$, - ::semget$, ::setjmp$, ::shm_open$, ::shmget$, ::sigismember$, ::strcasecmp$, ::strsignal$, - ::ttyname$`` + ``^::std::async$, ^::std::launder$, ^::std::remove$, ^::std::remove_if$, + ^::std::unique$, ^::std::unique_ptr::release$, ^::std::basic_string::empty$, + ^::std::vector::empty$, ^::std::back_inserter$, ^::std::distance$, + ^::std::find$, ^::std::find_if$, ^::std::inserter$, ^::std::lower_bound$, + ^::std::make_pair$, ^::std::map::count$, ^::std::map::find$, + 
^::std::map::lower_bound$, ^::std::multimap::equal_range$, + ^::std::multimap::upper_bound$, ^::std::set::count$, ^::std::set::find$, + ^::std::setfill$, ^::std::setprecision$, ^::std::setw$, ^::std::upper_bound$, + ^::std::vector::at$, ^::bsearch$, ^::ferror$, ^::feof$, ^::isalnum$, + ^::isalpha$, ^::isblank$, ^::iscntrl$, ^::isdigit$, ^::isgraph$, ^::islower$, + ^::isprint$, ^::ispunct$, ^::isspace$, ^::isupper$, ^::iswalnum$, + ^::iswprint$, ^::iswspace$, ^::isxdigit$, ^::memchr$, ^::memcmp$, ^::strcmp$, + ^::strcoll$, ^::strncmp$, ^::strpbrk$, ^::strrchr$, ^::strspn$, ^::strstr$, + ^::wcscmp$, ^::access$, ^::bind$, ^::connect$, ^::difftime$, ^::dlsym$, + ^::fnmatch$, ^::getaddrinfo$, ^::getopt$, ^::htonl$, ^::htons$, + ^::iconv_open$, ^::inet_addr$, isascii$, isatty$, ^::mmap$, ^::newlocale$, + ^::openat$, ^::pathconf$, ^::pthread_equal$, ^::pthread_getspecific$, + ^::pthread_mutex_trylock$, ^::readdir$, ^::readlink$, ^::recvmsg$, + ^::regexec$, ^::scandir$, ^::semget$, ^::setjmp$, ^::shm_open$, ^::shmget$, + ^::sigismember$, ^::strcasecmp$, ^::strsignal$, ^::ttyname$`` - ``std::async()``. Not using the return value makes the call synchronous. - ``std::launder()``. Not using the return value usually means that the @@ -54,7 +57,8 @@ Options Semicolon-separated list of function return types to check. By default the following function return types are checked: - `::std::error_code`, `::std::error_condition`, `::std::errc`, `::std::expected`, `::boost::system::error_code` + `^::std::error_code$`, `^::std::error_condition$`, `^::std::errc$`, + `^::std::expected$`, `^::boost::system::error_code$` .. option:: AllowCastToVoid From 5431a31f87387763cca8d014e7c07394bab7a1ad Mon Sep 17 00:00:00 2001 From: Rafael Stahl Date: Thu, 18 Jul 2024 20:21:19 +0200 Subject: [PATCH 063/486] [clang-tidy][NFC] Fix gsl::not_null template parameter (#99472) `T` is expected to be a pointer type. 
https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#Rf-nullptr --- .../cppcoreguidelines/avoid-const-or-ref-data-members.rst | 2 +- .../cppcoreguidelines/avoid-const-or-ref-data-members.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst index 5783280478dc1..57c4829431e76 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.rst @@ -35,7 +35,7 @@ Examples: int* x; std::unique_ptr x; std::shared_ptr x; - gsl::not_null x; + gsl::not_null x; }; // Bad, rvalue reference member diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp index 5a5d05bb4e94e..e3864be134da3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp @@ -18,7 +18,7 @@ struct Ok { const int *pc; std::unique_ptr up; std::shared_ptr sp; - gsl::not_null n; + gsl::not_null n; }; struct ConstMember { @@ -60,7 +60,7 @@ struct Ok2 { const Foo *pc; std::unique_ptr up; std::shared_ptr sp; - gsl::not_null n; + gsl::not_null n; }; struct ConstMember2 { From 280d90d0fdb2734af6c071064c6f87a8fe8d06d0 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 18 Jul 2024 11:23:35 -0700 Subject: [PATCH 064/486] AMDGPU: Add back half and bfloat support for global_load_tr16 pats (#99540) half and bfloat are common types for 16-bit elements. The support of them was original there and dropped due to some reasons. 
This work adds the support of the float types back. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++ clang/lib/CodeGen/CGBuiltin.cpp | 10 +++- ...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 15 ++++-- ...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 6 ++- ...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 6 ++- .../builtins-amdgcn-global-load-tr-w32.cl | 22 +++++++++ .../builtins-amdgcn-global-load-tr-w64.cl | 22 +++++++++ llvm/lib/Target/AMDGPU/FLATInstructions.td | 4 ++ .../UniformityAnalysis/AMDGPU/intrinsics.ll | 36 ++++++++++++++ .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 48 +++++++++++++++++-- .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 48 +++++++++++++++++-- 11 files changed, 207 insertions(+), 14 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 56bba448e12a4..e62315eea277a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -445,8 +445,12 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64") 
//===----------------------------------------------------------------------===// // WMMA builtins. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 67027f8aa93f3..2ad62d6ee0bb2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18725,7 +18725,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: { + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: { Intrinsic::ID IID; switch (BuiltinID) { @@ -18734,7 +18738,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, IID = Intrinsic::amdgcn_global_load_tr_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: IID = Intrinsic::amdgcn_global_load_tr_b128; break; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index 1fcb1d721ad72..8242ae6a98c40 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -5,13 +5,22 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s 
__attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef __bf16 v4y __attribute__((ext_vector_type(4))); -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr, + global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr) { v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} - int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} - v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // 
expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + v4y out_8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 7a36881c051b1..6f7a93ef897ac 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -4,9 +4,13 @@ // REQUIRES: amdgpu-registered-target typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef __bf16 v4y __attribute__((ext_vector_type(4))); -void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr) +void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr) { int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + v4y out_4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} } diff 
--git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index 9155ee6e61822..b7323f1b41c2a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -5,9 +5,13 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr) { v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index ce8b2c2c7c5ba..186fc4eacfaaf 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -4,6 +4,8 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef half v8h 
__attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( // CHECK-GFX1200-NEXT: entry: @@ -24,3 +26,23 @@ v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr) { return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr); } + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]] +// +v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]] +// +v8y test_amdgcn_global_load_tr_b128_v8bf16(global v8y* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index b0eed07627f41..b6627f1c8114d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -3,6 +3,8 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef __bf16 v4y __attribute__((ext_vector_type(4))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( // CHECK-GFX1200-NEXT: entry: @@ -23,3 +25,23 @@ v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr) { 
return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr); } + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]] +// +v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]] +// +v4y test_amdgcn_global_load_tr_b128_v4bf16(global v4y* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr); +} diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 16dc019ede810..56cc6ffa19096 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1590,10 +1590,14 @@ let OtherPredicates = [isGFX12Plus] in { let WaveSizePredicate = isWave32 in { defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; + defm : GlobalFLATLoadPats ; + defm : GlobalFLATLoadPats ; } let WaveSizePredicate = isWave64 in { defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; + defm : GlobalFLATLoadPats ; + defm : GlobalFLATLoadPats ; } } diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 680c998a4b39f..b215fc2c2ae74 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -213,6 +213,22 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr) +define amdgpu_kernel void 
@global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr) define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -229,6 +245,22 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr) + store <4 x half> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr) + store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1 @@ -258,8 +290,12 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr 
addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1)) +declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1)) declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1)) +declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1)) attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 291c249e4b738..0e659b758cd0f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -4,9 +4,11 @@ declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1)) +declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1)) -define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b64: +define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b64_v2i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -24,8 +26,8 @@ entry: ret void } -define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b128: +define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v8i16: ; GFX12: ; %bb.0: ; %entry 
; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -42,3 +44,41 @@ entry: store <8 x i16> %val, ptr addrspace(1) %use ret void } + +define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v8f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep) + store <8 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v8bf16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep) + store <8 x bfloat> %val, ptr addrspace(1) %use + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 12742f4f7127b..d941830e8dafc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -4,9 
+4,11 @@ declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1)) +declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1)) -define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b64: +define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b64_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -24,8 +26,8 @@ entry: ret void } -define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b128: +define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v4i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -42,3 +44,41 @@ entry: store <4 x i16> %val, ptr addrspace(1) %use ret void } + +define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v4f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep) + store <4 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void 
@global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v4bf16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1) %gep) + store <4 x bfloat> %val, ptr addrspace(1) %use + ret void +} From 9527d77aefcf214944a4c8bd284dde3ffe9dff60 Mon Sep 17 00:00:00 2001 From: Angel Zhang Date: Thu, 18 Jul 2024 14:31:15 -0400 Subject: [PATCH 065/486] [mlir][spirv] Restructure code in `SPIRVConversion.cpp`. NFC. (#99393) --- .../SPIRV/Transforms/SPIRVConversion.cpp | 500 +++++++++--------- 1 file changed, 251 insertions(+), 249 deletions(-) diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp index e3a09ef1ff684..bf5044437fd09 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp @@ -40,6 +40,8 @@ using namespace mlir; +namespace { + //===----------------------------------------------------------------------===// // Utility functions //===----------------------------------------------------------------------===// @@ -171,18 +173,6 @@ static spirv::ScalarType getIndexType(MLIRContext *ctx, IntegerType::get(ctx, options.use64bitIndex ? 
64 : 32)); } -Type SPIRVTypeConverter::getIndexType() const { - return ::getIndexType(getContext(), options); -} - -MLIRContext *SPIRVTypeConverter::getContext() const { - return targetEnv.getAttr().getContext(); -} - -bool SPIRVTypeConverter::allows(spirv::Capability capability) const { - return targetEnv.allows(capability); -} - // TODO: This is a utility function that should probably be exposed by the // SPIR-V dialect. Keeping it local till the use case arises. static std::optional @@ -673,9 +663,9 @@ static Type convertMemrefType(const spirv::TargetEnv &targetEnv, /// This function is meant to handle the **compute** side; so it does not /// involve storage classes in its logic. The storage side is expected to be /// handled by MemRef conversion logic. -std::optional castToSourceType(const spirv::TargetEnv &targetEnv, - OpBuilder &builder, Type type, - ValueRange inputs, Location loc) { +static std::optional castToSourceType(const spirv::TargetEnv &targetEnv, + OpBuilder &builder, Type type, + ValueRange inputs, Location loc) { // We can only cast one value in SPIR-V. if (inputs.size() != 1) { auto castOp = builder.create(loc, type, inputs); @@ -731,140 +721,185 @@ std::optional castToSourceType(const spirv::TargetEnv &targetEnv, } //===----------------------------------------------------------------------===// -// SPIRVTypeConverter +// Builtin Variables //===----------------------------------------------------------------------===// -SPIRVTypeConverter::SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr, - const SPIRVConversionOptions &options) - : targetEnv(targetAttr), options(options) { - // Add conversions. The order matters here: later ones will be tried earlier. +static spirv::GlobalVariableOp getBuiltinVariable(Block &body, + spirv::BuiltIn builtin) { + // Look through all global variables in the given `body` block and check if + // there is a spirv.GlobalVariable that has the same `builtin` attribute. 
+ for (auto varOp : body.getOps()) { + if (auto builtinAttr = varOp->getAttrOfType( + spirv::SPIRVDialect::getAttributeName( + spirv::Decoration::BuiltIn))) { + auto varBuiltIn = spirv::symbolizeBuiltIn(builtinAttr.getValue()); + if (varBuiltIn && *varBuiltIn == builtin) { + return varOp; + } + } + } + return nullptr; +} - // Allow all SPIR-V dialect specific types. This assumes all builtin types - // adopted in the SPIR-V dialect (i.e., IntegerType, FloatType, VectorType) - // were tried before. - // - // TODO: This assumes that the SPIR-V types are valid to use in the given - // target environment, which should be the case if the whole pipeline is - // driven by the same target environment. Still, we probably still want to - // validate and convert to be safe. - addConversion([](spirv::SPIRVType type) { return type; }); +/// Gets name of global variable for a builtin. +std::string getBuiltinVarName(spirv::BuiltIn builtin, StringRef prefix, + StringRef suffix) { + return Twine(prefix).concat(stringifyBuiltIn(builtin)).concat(suffix).str(); +} - addConversion([this](IndexType /*indexType*/) { return getIndexType(); }); +/// Gets or inserts a global variable for a builtin within `body` block. 
+static spirv::GlobalVariableOp +getOrInsertBuiltinVariable(Block &body, Location loc, spirv::BuiltIn builtin, + Type integerType, OpBuilder &builder, + StringRef prefix, StringRef suffix) { + if (auto varOp = getBuiltinVariable(body, builtin)) + return varOp; - addConversion([this](IntegerType intType) -> std::optional { - if (auto scalarType = dyn_cast(intType)) - return convertScalarType(this->targetEnv, this->options, scalarType); - if (intType.getWidth() < 8) - return convertSubByteIntegerType(this->options, intType); - return Type(); - }); + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(&body); - addConversion([this](FloatType floatType) -> std::optional { - if (auto scalarType = dyn_cast(floatType)) - return convertScalarType(this->targetEnv, this->options, scalarType); - return Type(); - }); + spirv::GlobalVariableOp newVarOp; + switch (builtin) { + case spirv::BuiltIn::NumWorkgroups: + case spirv::BuiltIn::WorkgroupSize: + case spirv::BuiltIn::WorkgroupId: + case spirv::BuiltIn::LocalInvocationId: + case spirv::BuiltIn::GlobalInvocationId: { + auto ptrType = spirv::PointerType::get(VectorType::get({3}, integerType), + spirv::StorageClass::Input); + std::string name = getBuiltinVarName(builtin, prefix, suffix); + newVarOp = + builder.create(loc, ptrType, name, builtin); + break; + } + case spirv::BuiltIn::SubgroupId: + case spirv::BuiltIn::NumSubgroups: + case spirv::BuiltIn::SubgroupSize: { + auto ptrType = + spirv::PointerType::get(integerType, spirv::StorageClass::Input); + std::string name = getBuiltinVarName(builtin, prefix, suffix); + newVarOp = + builder.create(loc, ptrType, name, builtin); + break; + } + default: + emitError(loc, "unimplemented builtin variable generation for ") + << stringifyBuiltIn(builtin); + } + return newVarOp; +} - addConversion([this](ComplexType complexType) { - return convertComplexType(this->targetEnv, this->options, complexType); - }); 
+//===----------------------------------------------------------------------===// +// Push constant storage +//===----------------------------------------------------------------------===// - addConversion([this](VectorType vectorType) { - return convertVectorType(this->targetEnv, this->options, vectorType); - }); +/// Returns the pointer type for the push constant storage containing +/// `elementCount` 32-bit integer values. +static spirv::PointerType getPushConstantStorageType(unsigned elementCount, + Builder &builder, + Type indexType) { + auto arrayType = spirv::ArrayType::get(indexType, elementCount, + /*stride=*/4); + auto structType = spirv::StructType::get({arrayType}, /*offsetInfo=*/0); + return spirv::PointerType::get(structType, spirv::StorageClass::PushConstant); +} - addConversion([this](TensorType tensorType) { - return convertTensorType(this->targetEnv, this->options, tensorType); - }); +/// Returns the push constant varible containing `elementCount` 32-bit integer +/// values in `body`. Returns null op if such an op does not exit. +static spirv::GlobalVariableOp getPushConstantVariable(Block &body, + unsigned elementCount) { + for (auto varOp : body.getOps()) { + auto ptrType = dyn_cast(varOp.getType()); + if (!ptrType) + continue; - addConversion([this](MemRefType memRefType) { - return convertMemrefType(this->targetEnv, this->options, memRefType); - }); + // Note that Vulkan requires "There must be no more than one push constant + // block statically used per shader entry point." So we should always reuse + // the existing one. + if (ptrType.getStorageClass() == spirv::StorageClass::PushConstant) { + auto numElements = cast( + cast(ptrType.getPointeeType()) + .getElementType(0)) + .getNumElements(); + if (numElements == elementCount) + return varOp; + } + } + return nullptr; +} - // Register some last line of defense casting logic. 
- addSourceMaterialization( - [this](OpBuilder &builder, Type type, ValueRange inputs, Location loc) { - return castToSourceType(this->targetEnv, builder, type, inputs, loc); - }); - addTargetMaterialization([](OpBuilder &builder, Type type, ValueRange inputs, - Location loc) { - auto cast = builder.create(loc, type, inputs); - return std::optional(cast.getResult(0)); - }); +/// Gets or inserts a global variable for push constant storage containing +/// `elementCount` 32-bit integer values in `block`. +static spirv::GlobalVariableOp +getOrInsertPushConstantVariable(Location loc, Block &block, + unsigned elementCount, OpBuilder &b, + Type indexType) { + if (auto varOp = getPushConstantVariable(block, elementCount)) + return varOp; + + auto builder = OpBuilder::atBlockBegin(&block, b.getListener()); + auto type = getPushConstantStorageType(elementCount, builder, indexType); + const char *name = "__push_constant_var__"; + return builder.create(loc, type, name, + /*initializer=*/nullptr); } //===----------------------------------------------------------------------===// // func::FuncOp Conversion Patterns //===----------------------------------------------------------------------===// -namespace { /// A pattern for rewriting function signature to convert arguments of functions /// to be of valid SPIR-V types. 
-class FuncOpConversion final : public OpConversionPattern { -public: +struct FuncOpConversion final : OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override; -}; -} // namespace - -LogicalResult -FuncOpConversion::matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - auto fnType = funcOp.getFunctionType(); - if (fnType.getNumResults() > 1) - return failure(); - - TypeConverter::SignatureConversion signatureConverter(fnType.getNumInputs()); - for (const auto &argType : enumerate(fnType.getInputs())) { - auto convertedType = getTypeConverter()->convertType(argType.value()); - if (!convertedType) - return failure(); - signatureConverter.addInputs(argType.index(), convertedType); - } - - Type resultType; - if (fnType.getNumResults() == 1) { - resultType = getTypeConverter()->convertType(fnType.getResult(0)); - if (!resultType) + ConversionPatternRewriter &rewriter) const override { + FunctionType fnType = funcOp.getFunctionType(); + if (fnType.getNumResults() > 1) return failure(); - } - - // Create the converted spirv.func op. - auto newFuncOp = rewriter.create( - funcOp.getLoc(), funcOp.getName(), - rewriter.getFunctionType(signatureConverter.getConvertedTypes(), - resultType ? TypeRange(resultType) - : TypeRange())); - // Copy over all attributes other than the function name and type. 
- for (const auto &namedAttr : funcOp->getAttrs()) { - if (namedAttr.getName() != funcOp.getFunctionTypeAttrName() && - namedAttr.getName() != SymbolTable::getSymbolAttrName()) - newFuncOp->setAttr(namedAttr.getName(), namedAttr.getValue()); - } + TypeConverter::SignatureConversion signatureConverter( + fnType.getNumInputs()); + for (const auto &argType : enumerate(fnType.getInputs())) { + auto convertedType = getTypeConverter()->convertType(argType.value()); + if (!convertedType) + return failure(); + signatureConverter.addInputs(argType.index(), convertedType); + } - rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(), - newFuncOp.end()); - if (failed(rewriter.convertRegionTypes( - &newFuncOp.getBody(), *getTypeConverter(), &signatureConverter))) - return failure(); - rewriter.eraseOp(funcOp); - return success(); -} + Type resultType; + if (fnType.getNumResults() == 1) { + resultType = getTypeConverter()->convertType(fnType.getResult(0)); + if (!resultType) + return failure(); + } -void mlir::populateBuiltinFuncToSPIRVPatterns(SPIRVTypeConverter &typeConverter, - RewritePatternSet &patterns) { - patterns.add(typeConverter, patterns.getContext()); -} + // Create the converted spirv.func op. + auto newFuncOp = rewriter.create( + funcOp.getLoc(), funcOp.getName(), + rewriter.getFunctionType(signatureConverter.getConvertedTypes(), + resultType ? TypeRange(resultType) + : TypeRange())); + + // Copy over all attributes other than the function name and type. 
+ for (const auto &namedAttr : funcOp->getAttrs()) { + if (namedAttr.getName() != funcOp.getFunctionTypeAttrName() && + namedAttr.getName() != SymbolTable::getSymbolAttrName()) + newFuncOp->setAttr(namedAttr.getName(), namedAttr.getValue()); + } -//===----------------------------------------------------------------------===// -// func::FuncOp Conversion Patterns -//===----------------------------------------------------------------------===// + rewriter.inlineRegionBefore(funcOp.getBody(), newFuncOp.getBody(), + newFuncOp.end()); + if (failed(rewriter.convertRegionTypes( + &newFuncOp.getBody(), *getTypeConverter(), &signatureConverter))) + return failure(); + rewriter.eraseOp(funcOp); + return success(); + } +}; -namespace { /// A pattern for rewriting function signature to convert vector arguments of /// functions to be of valid types struct FuncOpVectorUnroll final : OpRewritePattern { @@ -1015,17 +1050,11 @@ struct FuncOpVectorUnroll final : OpRewritePattern { return success(); } }; -} // namespace - -void mlir::populateFuncOpVectorRewritePatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} //===----------------------------------------------------------------------===// // func::ReturnOp Conversion Patterns //===----------------------------------------------------------------------===// -namespace { /// A pattern for rewriting function signature and the return op to convert /// vectors to be of valid types. 
struct ReturnOpVectorUnroll final : OpRewritePattern { @@ -1097,81 +1126,13 @@ struct ReturnOpVectorUnroll final : OpRewritePattern { return success(); } }; -} // namespace -void mlir::populateReturnOpVectorRewritePatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} +} // namespace //===----------------------------------------------------------------------===// -// Builtin Variables +// Public function for builtin variables //===----------------------------------------------------------------------===// -static spirv::GlobalVariableOp getBuiltinVariable(Block &body, - spirv::BuiltIn builtin) { - // Look through all global variables in the given `body` block and check if - // there is a spirv.GlobalVariable that has the same `builtin` attribute. - for (auto varOp : body.getOps()) { - if (auto builtinAttr = varOp->getAttrOfType( - spirv::SPIRVDialect::getAttributeName( - spirv::Decoration::BuiltIn))) { - auto varBuiltIn = spirv::symbolizeBuiltIn(builtinAttr.getValue()); - if (varBuiltIn && *varBuiltIn == builtin) { - return varOp; - } - } - } - return nullptr; -} - -/// Gets name of global variable for a builtin. -static std::string getBuiltinVarName(spirv::BuiltIn builtin, StringRef prefix, - StringRef suffix) { - return Twine(prefix).concat(stringifyBuiltIn(builtin)).concat(suffix).str(); -} - -/// Gets or inserts a global variable for a builtin within `body` block. 
-static spirv::GlobalVariableOp -getOrInsertBuiltinVariable(Block &body, Location loc, spirv::BuiltIn builtin, - Type integerType, OpBuilder &builder, - StringRef prefix, StringRef suffix) { - if (auto varOp = getBuiltinVariable(body, builtin)) - return varOp; - - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&body); - - spirv::GlobalVariableOp newVarOp; - switch (builtin) { - case spirv::BuiltIn::NumWorkgroups: - case spirv::BuiltIn::WorkgroupSize: - case spirv::BuiltIn::WorkgroupId: - case spirv::BuiltIn::LocalInvocationId: - case spirv::BuiltIn::GlobalInvocationId: { - auto ptrType = spirv::PointerType::get(VectorType::get({3}, integerType), - spirv::StorageClass::Input); - std::string name = getBuiltinVarName(builtin, prefix, suffix); - newVarOp = - builder.create(loc, ptrType, name, builtin); - break; - } - case spirv::BuiltIn::SubgroupId: - case spirv::BuiltIn::NumSubgroups: - case spirv::BuiltIn::SubgroupSize: { - auto ptrType = - spirv::PointerType::get(integerType, spirv::StorageClass::Input); - std::string name = getBuiltinVarName(builtin, prefix, suffix); - newVarOp = - builder.create(loc, ptrType, name, builtin); - break; - } - default: - emitError(loc, "unimplemented builtin variable generation for ") - << stringifyBuiltIn(builtin); - } - return newVarOp; -} - Value mlir::spirv::getBuiltinVariableValue(Operation *op, spirv::BuiltIn builtin, Type integerType, OpBuilder &builder, @@ -1190,60 +1151,9 @@ Value mlir::spirv::getBuiltinVariableValue(Operation *op, } //===----------------------------------------------------------------------===// -// Push constant storage +// Public function for pushing constant storage //===----------------------------------------------------------------------===// -/// Returns the pointer type for the push constant storage containing -/// `elementCount` 32-bit integer values. 
-static spirv::PointerType getPushConstantStorageType(unsigned elementCount, - Builder &builder, - Type indexType) { - auto arrayType = spirv::ArrayType::get(indexType, elementCount, - /*stride=*/4); - auto structType = spirv::StructType::get({arrayType}, /*offsetInfo=*/0); - return spirv::PointerType::get(structType, spirv::StorageClass::PushConstant); -} - -/// Returns the push constant varible containing `elementCount` 32-bit integer -/// values in `body`. Returns null op if such an op does not exit. -static spirv::GlobalVariableOp getPushConstantVariable(Block &body, - unsigned elementCount) { - for (auto varOp : body.getOps()) { - auto ptrType = dyn_cast(varOp.getType()); - if (!ptrType) - continue; - - // Note that Vulkan requires "There must be no more than one push constant - // block statically used per shader entry point." So we should always reuse - // the existing one. - if (ptrType.getStorageClass() == spirv::StorageClass::PushConstant) { - auto numElements = cast( - cast(ptrType.getPointeeType()) - .getElementType(0)) - .getNumElements(); - if (numElements == elementCount) - return varOp; - } - } - return nullptr; -} - -/// Gets or inserts a global variable for push constant storage containing -/// `elementCount` 32-bit integer values in `block`. 
-static spirv::GlobalVariableOp -getOrInsertPushConstantVariable(Location loc, Block &block, - unsigned elementCount, OpBuilder &b, - Type indexType) { - if (auto varOp = getPushConstantVariable(block, elementCount)) - return varOp; - - auto builder = OpBuilder::atBlockBegin(&block, b.getListener()); - auto type = getPushConstantStorageType(elementCount, builder, indexType); - const char *name = "__push_constant_var__"; - return builder.create(loc, type, name, - /*initializer=*/nullptr); -} - Value spirv::getPushConstantValue(Operation *op, unsigned elementCount, unsigned offset, Type integerType, OpBuilder &builder) { @@ -1267,7 +1177,7 @@ Value spirv::getPushConstantValue(Operation *op, unsigned elementCount, } //===----------------------------------------------------------------------===// -// Index calculation +// Public functions for index calculation //===----------------------------------------------------------------------===// Value mlir::spirv::linearizeIndex(ValueRange indices, ArrayRef strides, @@ -1375,6 +1285,81 @@ Value mlir::spirv::getElementPtr(const SPIRVTypeConverter &typeConverter, builder); } +//===----------------------------------------------------------------------===// +// SPIR-V TypeConverter +//===----------------------------------------------------------------------===// + +SPIRVTypeConverter::SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr, + const SPIRVConversionOptions &options) + : targetEnv(targetAttr), options(options) { + // Add conversions. The order matters here: later ones will be tried earlier. + + // Allow all SPIR-V dialect specific types. This assumes all builtin types + // adopted in the SPIR-V dialect (i.e., IntegerType, FloatType, VectorType) + // were tried before. + // + // TODO: This assumes that the SPIR-V types are valid to use in the given + // target environment, which should be the case if the whole pipeline is + // driven by the same target environment. 
Still, we probably still want to + // validate and convert to be safe. + addConversion([](spirv::SPIRVType type) { return type; }); + + addConversion([this](IndexType /*indexType*/) { return getIndexType(); }); + + addConversion([this](IntegerType intType) -> std::optional { + if (auto scalarType = dyn_cast(intType)) + return convertScalarType(this->targetEnv, this->options, scalarType); + if (intType.getWidth() < 8) + return convertSubByteIntegerType(this->options, intType); + return Type(); + }); + + addConversion([this](FloatType floatType) -> std::optional { + if (auto scalarType = dyn_cast(floatType)) + return convertScalarType(this->targetEnv, this->options, scalarType); + return Type(); + }); + + addConversion([this](ComplexType complexType) { + return convertComplexType(this->targetEnv, this->options, complexType); + }); + + addConversion([this](VectorType vectorType) { + return convertVectorType(this->targetEnv, this->options, vectorType); + }); + + addConversion([this](TensorType tensorType) { + return convertTensorType(this->targetEnv, this->options, tensorType); + }); + + addConversion([this](MemRefType memRefType) { + return convertMemrefType(this->targetEnv, this->options, memRefType); + }); + + // Register some last line of defense casting logic. 
+ addSourceMaterialization( + [this](OpBuilder &builder, Type type, ValueRange inputs, Location loc) { + return castToSourceType(this->targetEnv, builder, type, inputs, loc); + }); + addTargetMaterialization([](OpBuilder &builder, Type type, ValueRange inputs, + Location loc) { + auto cast = builder.create(loc, type, inputs); + return std::optional(cast.getResult(0)); + }); +} + +Type SPIRVTypeConverter::getIndexType() const { + return ::getIndexType(getContext(), options); +} + +MLIRContext *SPIRVTypeConverter::getContext() const { + return targetEnv.getAttr().getContext(); +} + +bool SPIRVTypeConverter::allows(spirv::Capability capability) const { + return targetEnv.allows(capability); +} + //===----------------------------------------------------------------------===// // SPIR-V ConversionTarget //===----------------------------------------------------------------------===// @@ -1468,3 +1453,20 @@ bool SPIRVConversionTarget::isLegalOp(Operation *op) { return true; } + +//===----------------------------------------------------------------------===// +// Public functions for populating patterns +//===----------------------------------------------------------------------===// + +void mlir::populateBuiltinFuncToSPIRVPatterns(SPIRVTypeConverter &typeConverter, + RewritePatternSet &patterns) { + patterns.add(typeConverter, patterns.getContext()); +} + +void mlir::populateFuncOpVectorRewritePatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +void mlir::populateReturnOpVectorRewritePatterns(RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} From f6f88f4b99638821af803d1911ab6a7dac04880b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 18 Jul 2024 13:36:55 -0500 Subject: [PATCH 066/486] [LLVM] Silence compiler-rt warning in runtimes build (#99525) Summary: The `compiler-rt` project wants `LLVM_CMAKE_DIR` but the `llvm_ExternalProject_add` interface sets the `LLVM_CONFIG_PATH`. 
This patch just makes the utility pass that as well. --- llvm/cmake/modules/LLVMExternalProjectUtils.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake index eef0c16f6847e..cd071d50bdce9 100644 --- a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake @@ -350,6 +350,7 @@ function(llvm_ExternalProject_Add name source_dir) ${sysroot_arg} -DLLVM_BINARY_DIR=${PROJECT_BINARY_DIR} -DLLVM_CONFIG_PATH=${llvm_config_path} + -DLLVM_CMAKE_DIR=${LLVM_CMAKE_DIR} -DLLVM_ENABLE_WERROR=${LLVM_ENABLE_WERROR} -DLLVM_HOST_TRIPLE=${LLVM_HOST_TRIPLE} -DLLVM_HAVE_LINK_VERSION_SCRIPT=${LLVM_HAVE_LINK_VERSION_SCRIPT} From 5e8cd29d62a72ed18e7bc782554d7f14eccec0ee Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 18 Jul 2024 11:32:56 -0700 Subject: [PATCH 067/486] [RISCV] Add coverage for vector combine reduce(cast x) transformation This covers both the existing trunc transform - basically checking that it performs sanely with the RISCV cost model - and a planned change to handle sext/zext as well. 
--- .../VectorCombine/RISCV/vecreduce-of-cast.ll | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/RISCV/vecreduce-of-cast.ll diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vecreduce-of-cast.ll b/llvm/test/Transforms/VectorCombine/RISCV/vecreduce-of-cast.ll new file mode 100644 index 0000000000000..9b1aa19f85c21 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/RISCV/vecreduce-of-cast.ll @@ -0,0 +1,126 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv32 -mattr=+v | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv64 -mattr=+v | FileCheck %s + +; +; Fold reduce(cast(X)) -> trunc(cast(X)) if more cost efficient +; + +define i32 @reduce_add_trunc_v8i64_to_v8i32(<8 x i64> %a0) { +; CHECK-LABEL: @reduce_add_trunc_v8i64_to_v8i32( +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]]) +; CHECK-NEXT: [[RED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[RED]] +; + %tr = trunc <8 x i64> %a0 to <8 x i32> + %red = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %tr) + ret i32 %red +} + +define i16 @reduce_add_trunc_v8i64_to_v8i16(<8 x i64> %a0) { +; CHECK-LABEL: @reduce_add_trunc_v8i64_to_v8i16( +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]]) +; CHECK-NEXT: [[RED:%.*]] = trunc i64 [[TMP1]] to i16 +; CHECK-NEXT: ret i16 [[RED]] +; + %tr = trunc <8 x i64> %a0 to <8 x i16> + %red = tail call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %tr) + ret i16 %red +} + +define i8 @reduce_add_trunc_v8i64_to_v8i8(<8 x i64> %a0) { +; CHECK-LABEL: @reduce_add_trunc_v8i64_to_v8i8( +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[A0:%.*]]) +; CHECK-NEXT: [[RED:%.*]] = trunc i64 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[RED]] +; + %tr = trunc <8 x i64> %a0 to <8 x i8> + %red = tail call i8 
@llvm.vector.reduce.add.v8i8(<8 x i8> %tr) + ret i8 %red +} + +define i8 @reduce_or_trunc_v8i32_i8(<8 x i32> %a0) { +; CHECK-LABEL: @reduce_or_trunc_v8i32_i8( +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[A0:%.*]]) +; CHECK-NEXT: [[RED:%.*]] = trunc i32 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[RED]] +; + %tr = trunc <8 x i32> %a0 to <8 x i8> + %red = tail call i8 @llvm.vector.reduce.or.v8i32(<8 x i8> %tr) + ret i8 %red +} + +define i8 @reduce_xor_trunc_v16i64_i8(<16 x i64> %a0) { +; CHECK-LABEL: @reduce_xor_trunc_v16i64_i8( +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> [[A0:%.*]]) +; CHECK-NEXT: [[RED:%.*]] = trunc i64 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[RED]] +; + %tr = trunc <16 x i64> %a0 to <16 x i8> + %red = tail call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %tr) + ret i8 %red +} + +define i16 @reduce_mul_trunc_v8i64_i16(<8 x i64> %a0) { +; CHECK-LABEL: @reduce_mul_trunc_v8i64_i16( +; CHECK-NEXT: [[TR:%.*]] = trunc <8 x i64> [[A0:%.*]] to <8 x i16> +; CHECK-NEXT: [[RED:%.*]] = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[TR]]) +; CHECK-NEXT: ret i16 [[RED]] +; + %tr = trunc <8 x i64> %a0 to <8 x i16> + %red = tail call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %tr) + ret i16 %red +} + +define i32 @reduce_or_sext_v8i8_to_v8i32(<8 x i8> %a0) { +; CHECK-LABEL: @reduce_or_sext_v8i8_to_v8i32( +; CHECK-NEXT: [[TR:%.*]] = sext <8 x i8> [[A0:%.*]] to <8 x i32> +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TR]]) +; CHECK-NEXT: ret i32 [[RED]] +; + %tr = sext <8 x i8> %a0 to <8 x i32> + %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr) + ret i32 %red +} + +define i32 @reduce_or_sext_v8i16_to_v8i32(<8 x i16> %a0) { +; CHECK-LABEL: @reduce_or_sext_v8i16_to_v8i32( +; CHECK-NEXT: [[TR:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TR]]) +; CHECK-NEXT: ret 
i32 [[RED]] +; + %tr = sext <8 x i16> %a0 to <8 x i32> + %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr) + ret i32 %red +} + +define i32 @reduce_or_zext_v8i8_to_v8i32(<8 x i8> %a0) { +; CHECK-LABEL: @reduce_or_zext_v8i8_to_v8i32( +; CHECK-NEXT: [[TR:%.*]] = zext <8 x i8> [[A0:%.*]] to <8 x i32> +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TR]]) +; CHECK-NEXT: ret i32 [[RED]] +; + %tr = zext <8 x i8> %a0 to <8 x i32> + %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr) + ret i32 %red +} + +define i32 @reduce_or_zext_v8i16_to_v8i32(<8 x i16> %a0) { +; CHECK-LABEL: @reduce_or_zext_v8i16_to_v8i32( +; CHECK-NEXT: [[TR:%.*]] = zext <8 x i16> [[A0:%.*]] to <8 x i32> +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TR]]) +; CHECK-NEXT: ret i32 [[RED]] +; + %tr = zext <8 x i16> %a0 to <8 x i32> + %red = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %tr) + ret i32 %red +} + +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>) +declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) +declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>) + From eed72d4381261bfe1acb693fb8751c05765c4831 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 18 Jul 2024 20:13:45 +0100 Subject: [PATCH 068/486] [mlir][ArmSME] Support filling liveness 'holes' in the tile allocator (#98350) Holes in a live range are points where the corresponding value does not need to be in a tile/register. If the tile allocator keeps track of these holes it can reuse tiles for more values (avoiding spills). 
Take this simple example: ```mlir func.func @example(%cond: i1) { %tileA = arm_sme.get_tile : vector<[4]x[4]xf32> cf.cond_br %cond, ^bb2, ^bb1 ^bb1: // If we end up here we never use %tileA again! "test.some_use"(%tileB) : (vector<[4]x[4]xf32>) -> () cf.br ^bb3 ^bb2: "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> () cf.br ^bb3 ^bb3: return } ``` If you were to calculate the liveness of %tileA and %tileB. You'd see there is a hole in the liveness of %tileA in bb1: ``` %tileA %tileB ^bb0: Live ^bb1: Live ^bb2: Live ``` The tile allocator can make use of that hole and reuse the tile ID it assigned to %tileA for %tileB. --- .../ArmSME/Transforms/TileAllocation.cpp | 134 ++++++++++--- .../ArmSME/tile-allocation-liveness.mlir | 178 ++++++++++++++++++ 2 files changed, 283 insertions(+), 29 deletions(-) diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp index 733e758b43907..5cac770b03ed1 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp @@ -153,10 +153,18 @@ class TileAllocator { return failure(); } + /// Acquires a specific tile ID. Asserts the tile is initially free. + void acquireTileId(ArmSMETileType tileType, unsigned tileId) { + TileMask tileMask = getMasks(tileType)[tileId]; + assert((tilesInUse & tileMask) == TileMask::kNone && + "cannot acquire allocated tile!"); + tilesInUse |= tileMask; + } + /// Releases a previously allocated tile ID. void releaseTileId(ArmSMETileType tileType, unsigned tileId) { TileMask tileMask = getMasks(tileType)[tileId]; - assert((tilesInUse & tileMask) != TileMask::kNone && + assert((tilesInUse & tileMask) == tileMask && "cannot release unallocated tile!"); tilesInUse ^= tileMask; } @@ -289,6 +297,11 @@ struct LiveRange { .valid(); } + /// Returns true if this range is active at `point` in the program. 
+ bool overlaps(uint64_t point) const { + return ranges->lookup(point) == kValidLiveRange; + } + /// Unions this live range with `otherRange`, aborts if the ranges overlap. void unionWith(LiveRange const &otherRange) { for (auto it = otherRange.ranges->begin(); it != otherRange.ranges->end(); @@ -488,76 +501,139 @@ coalesceTileLiveRanges(DenseMap &initialLiveRanges) { return std::move(coalescedLiveRanges); } -/// Choose a live range to spill (via some heuristics). This picks either an -/// active live range from `activeRanges` or the new live range `newRange`. -LiveRange *chooseSpillUsingHeuristics(ArrayRef activeRanges, - LiveRange *newRange) { +/// Choose a live range to spill (via some heuristics). This picks either a live +/// range from `overlappingRanges`, or the new live range `newRange`. +template +LiveRange * +chooseSpillUsingHeuristics(OverlappingRangesIterator overlappingRanges, + LiveRange *newRange) { // Heuristic: Spill trivially copyable operations (usually free). - auto isTrivialSpill = [&](LiveRange *allocatedRange) { - return isTileTypeGreaterOrEqual(allocatedRange->getTileType(), + auto isTrivialSpill = [&](LiveRange &allocatedRange) { + return isTileTypeGreaterOrEqual(allocatedRange.getTileType(), newRange->getTileType()) && - allocatedRange->values.size() == 1 && + allocatedRange.values.size() == 1 && isTriviallyCloneableTileOp( - allocatedRange->values[0] - .getDefiningOp()); + allocatedRange.values[0].getDefiningOp()); }; - if (isTrivialSpill(newRange)) + if (isTrivialSpill(*newRange)) return newRange; - auto trivialSpill = llvm::find_if(activeRanges, isTrivialSpill); - if (trivialSpill != activeRanges.end()) - return *trivialSpill; + auto trivialSpill = llvm::find_if(overlappingRanges, isTrivialSpill); + if (trivialSpill != overlappingRanges.end()) + return &*trivialSpill; // Heuristic: Spill the range that ends last (with a compatible tile type). 
- auto isSmallerTileTypeOrEndsEarlier = [](LiveRange *a, LiveRange *b) { - return !isTileTypeGreaterOrEqual(a->getTileType(), b->getTileType()) || - a->end() < b->end(); + auto isSmallerTileTypeOrEndsEarlier = [](LiveRange &a, LiveRange &b) { + return !isTileTypeGreaterOrEqual(a.getTileType(), b.getTileType()) || + a.end() < b.end(); }; - LiveRange *lastActiveLiveRange = *std::max_element( - activeRanges.begin(), activeRanges.end(), isSmallerTileTypeOrEndsEarlier); - if (!isSmallerTileTypeOrEndsEarlier(lastActiveLiveRange, newRange)) - return lastActiveLiveRange; + LiveRange &latestEndingLiveRange = + *std::max_element(overlappingRanges.begin(), overlappingRanges.end(), + isSmallerTileTypeOrEndsEarlier); + if (!isSmallerTileTypeOrEndsEarlier(latestEndingLiveRange, *newRange)) + return &latestEndingLiveRange; return newRange; } /// Greedily allocate tile IDs to live ranges. Spill using simple heuristics. -/// Note: This does not attempt to fill holes in active live ranges. void allocateTilesToLiveRanges( ArrayRef liveRangesSortedByStartPoint) { TileAllocator tileAllocator; + // `activeRanges` = Live ranges that need to be in a tile at the + // `currentPoint` in the program. SetVector activeRanges; + // `inactiveRanges` = Live ranges that _do not_ need to be in a tile + // at the `currentPoint` in the program but could become active again later. + // An inactive section of a live range can be seen as a 'hole' in the live + // range, where it is possible to reuse the live range's tile ID _before_ it + // has ended. By identifying 'holes', the allocator can reuse tiles more + // often, which helps avoid costly tile spills. + SetVector inactiveRanges; for (LiveRange *nextRange : liveRangesSortedByStartPoint) { - // Release tile IDs from live ranges that have ended. + auto currentPoint = nextRange->start(); + // 1. Update the `activeRanges` at `currentPoint`. 
activeRanges.remove_if([&](LiveRange *activeRange) { - if (activeRange->end() <= nextRange->start()) { + // Check for live ranges that have expired. + if (activeRange->end() <= currentPoint) { tileAllocator.releaseTileId(activeRange->getTileType(), *activeRange->tileId); return true; } + // Check for live ranges that have become inactive. + if (!activeRange->overlaps(currentPoint)) { + tileAllocator.releaseTileId(activeRange->getTileType(), + *activeRange->tileId); + inactiveRanges.insert(activeRange); + return true; + } return false; }); + // 2. Update the `inactiveRanges` at `currentPoint`. + inactiveRanges.remove_if([&](LiveRange *inactiveRange) { + // Check for live ranges that have expired. + if (inactiveRange->end() <= currentPoint) { + return true; + } + // Check for live ranges that have become active. + if (inactiveRange->overlaps(currentPoint)) { + tileAllocator.acquireTileId(inactiveRange->getTileType(), + *inactiveRange->tileId); + activeRanges.insert(inactiveRange); + return true; + } + return false; + }); + + // 3. Collect inactive live ranges that overlap with the new live range. + // Note: The overlap checks in steps 1 and 2 only look at the `currentPoint` + // whereas this checks if there is an overlap at any future point too. + SmallVector overlappingInactiveRanges; + for (LiveRange *inactiveRange : inactiveRanges) { + if (inactiveRange->overlaps(*nextRange)) { + // We need to reserve the tile IDs of overlapping inactive ranges to + // prevent two (overlapping) live ranges from getting the same tile ID. + tileAllocator.acquireTileId(inactiveRange->getTileType(), + *inactiveRange->tileId); + overlappingInactiveRanges.push_back(inactiveRange); + } + } - // Allocate a tile ID to `nextRange`. + // 4. Allocate a tile ID to `nextRange`. 
auto rangeTileType = nextRange->getTileType(); auto tileId = tileAllocator.allocateTileId(rangeTileType); if (succeeded(tileId)) { nextRange->tileId = *tileId; } else { + // Create an iterator over all overlapping live ranges. + auto allOverlappingRanges = llvm::concat( + llvm::make_pointee_range(activeRanges.getArrayRef()), + llvm::make_pointee_range(overlappingInactiveRanges)); + // Choose an overlapping live range to spill. LiveRange *rangeToSpill = - chooseSpillUsingHeuristics(activeRanges.getArrayRef(), nextRange); + chooseSpillUsingHeuristics(allOverlappingRanges, nextRange); if (rangeToSpill != nextRange) { - // Spill an active live range (so release its tile ID first). + // Spill an (in)active live range (so release its tile ID first). tileAllocator.releaseTileId(rangeToSpill->getTileType(), *rangeToSpill->tileId); - activeRanges.remove(rangeToSpill); // This will always succeed after a spill (of an active live range). nextRange->tileId = *tileAllocator.allocateTileId(rangeTileType); + // Remove the live range from the active/inactive sets. + if (!activeRanges.remove(rangeToSpill)) { + bool removed = inactiveRanges.remove(rangeToSpill); + assert(removed && "expected a range to be removed!"); + } } rangeToSpill->tileId = tileAllocator.allocateInMemoryTileId(); } - // Insert the live range into the active ranges. + // 5. Insert the live range into the active ranges. if (nextRange->tileId < kInMemoryTileIdBase) activeRanges.insert(nextRange); + + // 6. Release tiles reserved for inactive live ranges (in step 3). 
+ for (LiveRange *range : overlappingInactiveRanges) { + if (*range->tileId < kInMemoryTileIdBase) + tileAllocator.releaseTileId(range->getTileType(), *range->tileId); + } } } diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir index 9c22b29ac22e7..2e1f3d1ee10a9 100644 --- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir +++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir @@ -430,3 +430,181 @@ func.func @cond_branch_with_backedge(%slice: vector<[4]xf32>) { // Live here: %finalTileA, %finalTileB, %finalTileC, %finalTileD return } + +// ----- + +// CHECK-LIVE-RANGE-LABEL: @fill_holes_in_tile_liveness +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE: ^bb0: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: E cf.cond_br +// CHECK-LIVE-RANGE-NEXT: ^bb1: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: E test.some_use +// CHECK-LIVE-RANGE-NEXT: cf.br +// CHECK-LIVE-RANGE-NEXT: ^bb2: +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: E test.some_use +// CHECK-LIVE-RANGE-NEXT: cf.br + +// Here there's a 'hole' in the liveness of %tileA (in bb1) where another value +// can reuse the tile ID assigned to %tileA. The liveness for %tileB is +// entirely within the 'hole' in %tileA's live range, so %tileB should get the +// same tile ID as %tileA. 
+ +// CHECK-LABEL: @fill_holes_in_tile_liveness +func.func @fill_holes_in_tile_liveness(%cond: i1) { + // CHECK: arm_sme.get_tile {tile_id = [[TILE_ID_A:.*]] : i32} + %tileA = arm_sme.get_tile : vector<[4]x[4]xf32> + cf.cond_br %cond, ^bb2, ^bb1 +^bb1: + // CHECK: arm_sme.get_tile {tile_id = [[TILE_ID_A]] : i32} + %tileB = arm_sme.get_tile : vector<[4]x[4]xf32> + "test.dummy"(): () -> () + "test.some_use"(%tileB) : (vector<[4]x[4]xf32>) -> () + cf.br ^bb3 +^bb2: + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> () + cf.br ^bb3 +^bb3: + return +} + +// ----- + +// CHECK-LIVE-RANGE-LABEL: @holes_in_tile_liveness_inactive_overlaps +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE: ^bb0: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: E cf.cond_br +// CHECK-LIVE-RANGE-NEXT: ^bb1: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: | test.some_use +// CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: E cf.br +// CHECK-LIVE-RANGE-NEXT: ^bb2: +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: |S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: E| test.some_use +// CHECK-LIVE-RANGE-NEXT: | arm_sme.copy_tile +// CHECK-LIVE-RANGE-NEXT: E cf.br +// CHECK-LIVE-RANGE-NEXT: ^bb3: +// CHECK-LIVE-RANGE-NEXT: E test.some_use +// CHECK-LIVE-RANGE-NEXT: func.return + +// This tests an edge case in inactive live ranges. The first live range is +// inactive at the start of ^bb1. If the tile allocator did not check if the +// second live range overlapped the first it would wrongly re-use tile ID 0 +// (as the first live range is inactive so tile ID 0 is free). This would mean +// in ^bb2 two overlapping live ranges would have the same tile ID (bad!). 
+ +// CHECK-LABEL: @holes_in_tile_liveness_inactive_overlaps +func.func @holes_in_tile_liveness_inactive_overlaps(%cond: i1) { + // CHECK: arm_sme.get_tile {tile_id = 0 : i32} + %tileA = arm_sme.get_tile : vector<[4]x[4]xf32> + cf.cond_br %cond, ^bb2, ^bb1 +^bb1: + // CHECK: arm_sme.get_tile {tile_id = 1 : i32} + %tileB = arm_sme.get_tile : vector<[4]x[4]xf32> + "test.dummy"(): () -> () + "test.some_use"(%tileB) : (vector<[4]x[4]xf32>) -> () + cf.br ^bb3(%tileB: vector<[4]x[4]xf32>) +^bb2: + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.dummy"(): () -> () + // CHECK: arm_sme.get_tile {tile_id = 1 : i32} + %tileC = arm_sme.get_tile : vector<[4]x[4]xf32> + "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> () + cf.br ^bb3(%tileC: vector<[4]x[4]xf32>) +^bb3(%tile: vector<[4]x[4]xf32>): + "test.some_use"(%tile) : (vector<[4]x[4]xf32>) -> () + return +} + +// ----- + +// This is the same as the previous example, but changes the tile types to +// vector<[16]x[16]xi8>. This means in bb1 the allocator will need to spill the +// first live range (which is inactive). + +// Note: The live ranges are the same as the previous example (so are not checked). 
+ +// CHECK-LABEL: @spill_inactive_live_range +func.func @spill_inactive_live_range(%cond: i1) { + // CHECK: arm_sme.get_tile {tile_id = 16 : i32} + %tileA = arm_sme.get_tile : vector<[16]x[16]xi8> + cf.cond_br %cond, ^bb2, ^bb1 +^bb1: + // CHECK: arm_sme.get_tile {tile_id = 0 : i32} + %tileB = arm_sme.get_tile : vector<[16]x[16]xi8> + "test.dummy"(): () -> () + "test.some_use"(%tileB) : (vector<[16]x[16]xi8>) -> () + cf.br ^bb3(%tileB: vector<[16]x[16]xi8>) +^bb2: + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.dummy"(): () -> () + // CHECK: arm_sme.get_tile {tile_id = 0 : i32} + %tileC = arm_sme.get_tile : vector<[16]x[16]xi8> + "test.some_use"(%tileA) : (vector<[16]x[16]xi8>) -> () + cf.br ^bb3(%tileC: vector<[16]x[16]xi8>) +^bb3(%tile: vector<[16]x[16]xi8>): + "test.some_use"(%tile) : (vector<[16]x[16]xi8>) -> () + return +} + +// ----- + +// CHECK-LIVE-RANGE-LABEL: @reactivate_inactive_live_range +// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges: +// CHECK-LIVE-RANGE: ^bb0: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: E cf.cond_br +// CHECK-LIVE-RANGE-NEXT: ^bb1: +// CHECK-LIVE-RANGE-NEXT: S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: | test.dummy +// CHECK-LIVE-RANGE-NEXT: E test.some_use +// CHECK-LIVE-RANGE-NEXT: cf.br +// CHECK-LIVE-RANGE-NEXT: ^bb2: +// CHECK-LIVE-RANGE-NEXT: | S arm_sme.get_tile +// CHECK-LIVE-RANGE-NEXT: | | test.dummy +// CHECK-LIVE-RANGE-NEXT: | | test.dummy +// CHECK-LIVE-RANGE-NEXT: | E test.some_use +// CHECK-LIVE-RANGE-NEXT: E test.some_use +// CHECK-LIVE-RANGE-NEXT: cf.br + +// Here the live range for %tileA becomes inactive in bb1 (so %tileB gets tile +// ID 0 too). Then in bb2 the live range for tileA is reactivated as it overlaps +// with the start of %tileC's live range (which means %tileC gets tile ID 1). 
+ +func.func @reactivate_inactive_live_range(%cond: i1) { + // CHECK: arm_sme.get_tile {tile_id = 0 : i32} + %tileA = arm_sme.get_tile : vector<[4]x[4]xf32> + cf.cond_br %cond, ^bb2, ^bb1 +^bb1: + // CHECK: arm_sme.get_tile {tile_id = 0 : i32} + %tileB = arm_sme.get_tile : vector<[16]x[16]xi8> + "test.dummy"(): () -> () + "test.some_use"(%tileB) : (vector<[16]x[16]xi8>) -> () + cf.br ^bb3 +^bb2: + // CHECK: arm_sme.get_tile {tile_id = 1 : i32} + %tileC = arm_sme.get_tile : vector<[4]x[4]xf32> + "test.dummy"(): () -> () + "test.dummy"(): () -> () + "test.some_use"(%tileC) : (vector<[4]x[4]xf32>) -> () + "test.some_use"(%tileA) : (vector<[4]x[4]xf32>) -> () + cf.br ^bb3 +^bb3: + return +} From e2c3cd7f3d0cd40bd8506ab305573d61a1ae25d9 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 18 Jul 2024 12:16:55 -0700 Subject: [PATCH 069/486] AMDGPU: Loop over the types for global_load_tr16 pats (NFC) (#99551) --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 56cc6ffa19096..88c6039473338 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1589,15 +1589,13 @@ let OtherPredicates = [isGFX12Plus] in { let WaveSizePredicate = isWave32 in { defm : GlobalFLATLoadPats ; - defm : GlobalFLATLoadPats ; - defm : GlobalFLATLoadPats ; - defm : GlobalFLATLoadPats ; + foreach vt = [v8i16, v8f16, v8bf16] in + defm : GlobalFLATLoadPats ; } let WaveSizePredicate = isWave64 in { defm : GlobalFLATLoadPats ; - defm : GlobalFLATLoadPats ; - defm : GlobalFLATLoadPats ; - defm : GlobalFLATLoadPats ; + foreach vt = [v4i16, v4f16, v4bf16] in + defm : GlobalFLATLoadPats ; } } From 82cca0c77e935b4972c31745d94edef616970b6c Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 18 Jul 2024 12:32:13 -0700 Subject: [PATCH 070/486] [IR] Unify max alignment for arguments with 
generic max align. (#99257) The 2^14 limit was completely arbitrary; the generic limit is still arbitrary, but at least it's the same arbitrary limit as everything else. While I'm here, also add a verifier check for the ByValOrByRefSize. --- llvm/include/llvm/CodeGen/TargetCallingConv.h | 8 ++-- llvm/lib/IR/Verifier.cpp | 38 +++++++++++-------- llvm/test/Verifier/byval-size-limit.ll | 4 ++ llvm/test/Verifier/param-align.ll | 14 +++---- llvm/test/Verifier/param-attr-align.ll | 4 +- llvm/test/Verifier/param-ret-align.ll | 12 +++--- 6 files changed, 46 insertions(+), 34 deletions(-) create mode 100644 llvm/test/Verifier/byval-size-limit.ll diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h index 0ff4e1fe657f4..cb0055633f4f3 100644 --- a/llvm/include/llvm/CodeGen/TargetCallingConv.h +++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h @@ -45,9 +45,9 @@ namespace ISD { unsigned IsHva : 1; ///< HVA field for unsigned IsHvaStart : 1; ///< HVA structure start unsigned IsSecArgPass : 1; ///< Second argument - unsigned MemAlign : 4; ///< Log 2 of alignment when arg is passed in memory - ///< (including byval/byref). The max alignment is - ///< verified in IR verification. + unsigned MemAlign : 6; ///< Log 2 of alignment when arg is passed in memory + ///< (including byval/byref). The max alignment is + ///< verified in IR verification. 
unsigned OrigAlign : 5; ///< Log 2 of original alignment unsigned IsInConsecutiveRegsLast : 1; unsigned IsInConsecutiveRegs : 1; @@ -67,7 +67,7 @@ namespace ISD { IsSecArgPass(0), MemAlign(0), OrigAlign(0), IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0), IsCopyElisionCandidate(0), IsPointer(0) { - static_assert(sizeof(*this) == 3 * sizeof(unsigned), "flags are too big"); + static_assert(sizeof(*this) == 4 * sizeof(unsigned), "flags are too big"); } bool isZExt() const { return IsZExt; } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 75a53c1c99734..c5c407637cbf3 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -324,13 +324,6 @@ namespace { class Verifier : public InstVisitor, VerifierSupport { friend class InstVisitor; - - // ISD::ArgFlagsTy::MemAlign only have 4 bits for alignment, so - // the alignment size should not exceed 2^15. Since encode(Align) - // would plus the shift value by 1, the alignment size should - // not exceed 2^14, otherwise it can NOT be properly lowered - // in backend. 
- static constexpr unsigned ParamMaxAlignment = 1 << 14; DominatorTree DT; /// When verifying a basic block, keep track of all of the @@ -2021,31 +2014,43 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty, } if (isa(Ty)) { + if (Attrs.hasAttribute(Attribute::Alignment)) { + Align AttrAlign = Attrs.getAlignment().valueOrOne(); + Check(AttrAlign.value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", V); + } if (Attrs.hasAttribute(Attribute::ByVal)) { - if (Attrs.hasAttribute(Attribute::Alignment)) { - Align AttrAlign = Attrs.getAlignment().valueOrOne(); - Align MaxAlign(ParamMaxAlignment); - Check(AttrAlign <= MaxAlign, - "Attribute 'align' exceed the max size 2^14", V); - } SmallPtrSet Visited; Check(Attrs.getByValType()->isSized(&Visited), "Attribute 'byval' does not support unsized types!", V); + Check(DL.getTypeAllocSize(Attrs.getByValType()).getKnownMinValue() < + (1ULL << 32), + "huge 'byval' arguments are unsupported", V); } if (Attrs.hasAttribute(Attribute::ByRef)) { SmallPtrSet Visited; Check(Attrs.getByRefType()->isSized(&Visited), "Attribute 'byref' does not support unsized types!", V); + Check(DL.getTypeAllocSize(Attrs.getByRefType()).getKnownMinValue() < + (1ULL << 32), + "huge 'byref' arguments are unsupported", V); } if (Attrs.hasAttribute(Attribute::InAlloca)) { SmallPtrSet Visited; Check(Attrs.getInAllocaType()->isSized(&Visited), "Attribute 'inalloca' does not support unsized types!", V); + Check(DL.getTypeAllocSize(Attrs.getInAllocaType()).getKnownMinValue() < + (1ULL << 32), + "huge 'inalloca' arguments are unsupported", V); } if (Attrs.hasAttribute(Attribute::Preallocated)) { SmallPtrSet Visited; Check(Attrs.getPreallocatedType()->isSized(&Visited), "Attribute 'preallocated' does not support unsized types!", V); + Check( + DL.getTypeAllocSize(Attrs.getPreallocatedType()).getKnownMinValue() < + (1ULL << 32), + "huge 'preallocated' arguments are unsupported", V); } } @@ -3511,12 +3516,15 @@ void 
Verifier::visitCallBase(CallBase &Call) { "not allowed. Please use the @llvm.amdgpu.cs.chain intrinsic instead.", Call); + // Disallow passing/returning values with alignment higher than we can + // represent. + // FIXME: Consider making DataLayout cap the alignment, so this isn't + // necessary. auto VerifyTypeAlign = [&](Type *Ty, const Twine &Message) { if (!Ty->isSized()) return; Align ABIAlign = DL.getABITypeAlign(Ty); - Align MaxAlign(ParamMaxAlignment); - Check(ABIAlign <= MaxAlign, + Check(ABIAlign.value() <= Value::MaximumAlignment, "Incorrect alignment of " + Message + " to called function!", Call); }; diff --git a/llvm/test/Verifier/byval-size-limit.ll b/llvm/test/Verifier/byval-size-limit.ll new file mode 100644 index 0000000000000..3eb462b063636 --- /dev/null +++ b/llvm/test/Verifier/byval-size-limit.ll @@ -0,0 +1,4 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s + +; CHECK: huge 'byval' arguments are unsupported +define void @f(ptr byval([2147483648 x i16])) { ret void } diff --git a/llvm/test/Verifier/param-align.ll b/llvm/test/Verifier/param-align.ll index bfd01cbc9faa5..caa8f9ac41ea5 100644 --- a/llvm/test/Verifier/param-align.ll +++ b/llvm/test/Verifier/param-align.ll @@ -2,19 +2,19 @@ ; Large vector for intrinsics is valid ; CHECK-NOT: llvm.fshr -define dso_local <8192 x i32> @test_intrin(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) { +define dso_local <2147483648 x i32> @test_intrin(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt) { entry: - %b = call <8192 x i32> @llvm.fshr.v8192i32(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) - ret <8192 x i32> %b + %b = call <2147483648 x i32> @llvm.fshr.v8192i32(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt) + ret <2147483648 x i32> %b } -declare <8192 x i32> @llvm.fshr.v8192i32 (<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) +declare <2147483648 x i32> @llvm.fshr.v8192i32 (<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> 
%amt) ; CHECK: Incorrect alignment of argument passed to called function! ; CHECK: bar -define dso_local void @foo(<8192 x float> noundef %vec) { +define dso_local void @foo(<2147483648 x float> noundef %vec) { entry: - call void @bar(<8192 x float> %vec) + call void @bar(<2147483648 x float> %vec) ret void } -declare dso_local void @bar(<8192 x float>) +declare dso_local void @bar(<2147483648 x float>) diff --git a/llvm/test/Verifier/param-attr-align.ll b/llvm/test/Verifier/param-attr-align.ll index 038bfa3494f89..700efe5376841 100644 --- a/llvm/test/Verifier/param-attr-align.ll +++ b/llvm/test/Verifier/param-attr-align.ll @@ -1,9 +1,9 @@ ; RUN: not llvm-as < %s 2>&1 | FileCheck %s -; CHECK: Attribute 'align' exceed the max size 2^14 +; CHECK: huge alignments are not supported yet define dso_local void @foo(ptr %p) { entry: - call void @bar(ptr noundef byval(<8 x float>) align 32768 %p) + call void @bar(ptr noundef byval(<8 x float>) align 8589934592 %p) ret void } diff --git a/llvm/test/Verifier/param-ret-align.ll b/llvm/test/Verifier/param-ret-align.ll index dd302c38b53d2..98cbb4ee88a89 100644 --- a/llvm/test/Verifier/param-ret-align.ll +++ b/llvm/test/Verifier/param-ret-align.ll @@ -2,19 +2,19 @@ ; Large vector for intrinsics is valid ; CHECK-NOT: llvm.fshr -define dso_local <8192 x i32> @test_intrin(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) { +define dso_local <2147483648 x i32> @test_intrin(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt) { entry: - %b = call <8192 x i32> @llvm.fshr.v8192i32(<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) - ret <8192 x i32> %b + %b = call <2147483648 x i32> @llvm.fshr.v8192i32(<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> %amt) + ret <2147483648 x i32> %b } -declare <8192 x i32> @llvm.fshr.v8192i32 (<8192 x i32> %l, <8192 x i32> %r, <8192 x i32> %amt) +declare <2147483648 x i32> @llvm.fshr.v2147483648i32 (<2147483648 x i32> %l, <2147483648 x i32> %r, <2147483648 x i32> 
%amt) ; CHECK: Incorrect alignment of return type to called function! ; CHECK: bar define dso_local void @foo() { entry: - call <8192 x float> @bar() + call <2147483648 x float> @bar() ret void } -declare dso_local <8192 x float> @bar() +declare dso_local <2147483648 x float> @bar() From 892c58cf7490c219ff8fc4dc0d2497e062a9c665 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 18 Jul 2024 15:33:03 -0400 Subject: [PATCH 071/486] [Clang][AMDGPU] Add builtins for instrinsic `llvm.amdgcn.raw.ptr.buffer.load` (#99258) --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + clang/lib/CodeGen/CGBuiltin.cpp | 33 ++++ .../builtins-amdgcn-raw-buffer-load.cl | 172 ++++++++++++++++++ .../builtins-amdgcn-raw-buffer-load-error.cl | 33 ++++ 4 files changed, 244 insertions(+) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-load-error.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e62315eea277a..774cbaa74f8af 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -155,6 +155,12 @@ BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "viQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_store_b64, "vV2iQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_store_b96, "vV3iQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_store_b128, "vV4iQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_load_b8, "UcQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_load_b16, "UsQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_load_b32, "UiQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_load_b64, "V2UiQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n") +BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n") //===----------------------------------------------------------------------===// // Ballot builtins. 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 2ad62d6ee0bb2..f426570b17e15 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19185,6 +19185,39 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b128: return emitBuiltinWithOneOverloadedType<5>( *this, E, Intrinsic::amdgcn_raw_ptr_buffer_store); + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96: + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: { + llvm::Type *RetTy = nullptr; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b8: + RetTy = Int8Ty; + break; + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b16: + RetTy = Int16Ty; + break; + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b32: + RetTy = Int32Ty; + break; + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b64: + RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/2); + break; + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b96: + RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/3); + break; + case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_b128: + RetTy = llvm::FixedVectorType::get(Int32Ty, /*NumElements=*/4); + break; + } + Function *F = + CGM.getIntrinsic(Intrinsic::amdgcn_raw_ptr_buffer_load, RetTy); + return Builder.CreateCall( + F, {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), + EmitScalarExpr(E->getArg(2)), EmitScalarExpr(E->getArg(3))}); + } default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl new file mode 100644 index 0000000000000..3403b69e07e4b --- /dev/null +++ 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl @@ -0,0 +1,172 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned int v2u32 __attribute__((ext_vector_type(2))); +typedef unsigned int v3u32 __attribute__((ext_vector_type(3))); +typedef unsigned int v4u32 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret i8 [[TMP0]] +// +u8 test_amdgcn_raw_ptr_buffer_load_b8(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b8(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret i16 [[TMP0]] +// +u16 test_amdgcn_raw_ptr_buffer_load_b16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b16(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret i32 [[TMP0]] +// +u32 test_amdgcn_raw_ptr_buffer_load_b32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b32(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b64( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// +v2u32 test_amdgcn_raw_ptr_buffer_load_b64(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b64(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b96( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <3 x i32> [[TMP0]] +// +v3u32 test_amdgcn_raw_ptr_buffer_load_b96(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b96(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_raw_ptr_buffer_load_b128(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b8_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret i8 [[TMP0]] +// +u8 test_amdgcn_raw_ptr_buffer_load_b8_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b8(rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b16_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) [[RSRC:%.*]], i32 
[[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret i16 [[TMP0]] +// +u16 test_amdgcn_raw_ptr_buffer_load_b16_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b16(rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b32_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret i32 [[TMP0]] +// +u32 test_amdgcn_raw_ptr_buffer_load_b32_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b32(rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b64_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// +v2u32 test_amdgcn_raw_ptr_buffer_load_b64_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b64(rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b96_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], i32 0, i32 0) +// CHECK-NEXT: ret <3 x i32> [[TMP0]] +// +v3u32 test_amdgcn_raw_ptr_buffer_load_b96_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b96(rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b128_non_const_offset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) [[RSRC:%.*]], i32 [[OFFSET:%.*]], 
i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_raw_ptr_buffer_load_b128_non_const_offset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b128(rsrc, offset, /*soffset=*/0, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b8_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret i8 [[TMP0]] +// +u8 test_amdgcn_raw_ptr_buffer_load_b8_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b8(rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b16_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret i16 [[TMP0]] +// +u16 test_amdgcn_raw_ptr_buffer_load_b16_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b16(rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b32_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret i32 [[TMP0]] +// +u32 test_amdgcn_raw_ptr_buffer_load_b32_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b32(rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b64_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <2 
x i32> [[TMP0]] +// +v2u32 test_amdgcn_raw_ptr_buffer_load_b64_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b64(rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b96_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v3i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <3 x i32> [[TMP0]] +// +v3u32 test_amdgcn_raw_ptr_buffer_load_b96_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b96(rsrc, /*offset=*/0, soffset, /*aux=*/0); +} + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_b128_non_const_soffset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) [[RSRC:%.*]], i32 0, i32 [[SOFFSET:%.*]], i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_raw_ptr_buffer_load_b128_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { + return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, soffset, /*aux=*/0); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-load-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-load-error.cl new file mode 100644 index 0000000000000..5d123c8e81d87 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-buffer-load-error.cl @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned int v2u32 __attribute__((ext_vector_type(2))); +typedef unsigned int v3u32 __attribute__((ext_vector_type(3))); +typedef unsigned int v4u32 __attribute__((ext_vector_type(4))); + +u8 
test_amdgcn_raw_ptr_buffer_load_b8(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_b8(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b8' must be a constant integer}} +} + +u16 test_amdgcn_raw_ptr_buffer_load_b16(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_b16(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b16' must be a constant integer}} +} + +u32 test_amdgcn_raw_ptr_buffer_load_b32(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_b32(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b32' must be a constant integer}} +} + +v2u32 test_amdgcn_raw_ptr_buffer_load_b64(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_b64(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b64' must be a constant integer}} +} + +v3u32 test_amdgcn_raw_ptr_buffer_load_b96(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_b96(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b96' must be a constant integer}} +} + +v4u32 test_amdgcn_raw_ptr_buffer_load_b128(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset, int aux) { + return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, /*soffset=*/0, aux); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_b128' must be a constant integer}} +} From 52d947b5c14173b0aee96e419a04a49f83e5a283 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Jul 2024 12:53:36 -0700 Subject: [PATCH 072/486] [LV] Remove unnecessary variable from 
InnerLoopVectorizer::createBitOrPointerCast. NFC DstVTy is already a VectorType, we don't need to cast it again. This used to be a cast to FixedVectorType that was changed to support scalable vectors. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 748db418fee8c..fbca4cdcbcfcd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2767,18 +2767,17 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, const DataLayout &DL) { // Verify that V is a vector type with same number of elements as DstVTy. - auto *DstFVTy = cast(DstVTy); - auto VF = DstFVTy->getElementCount(); + auto VF = DstVTy->getElementCount(); auto *SrcVecTy = cast(V->getType()); assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match"); Type *SrcElemTy = SrcVecTy->getElementType(); - Type *DstElemTy = DstFVTy->getElementType(); + Type *DstElemTy = DstVTy->getElementType(); assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && "Vector elements must have same size"); // Do a direct cast if element types are castable. if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) { - return Builder.CreateBitOrPointerCast(V, DstFVTy); + return Builder.CreateBitOrPointerCast(V, DstVTy); } // V cannot be directly casted to desired vector type. 
// May happen when V is a floating point vector but DstVTy is a vector of @@ -2792,7 +2791,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy)); auto *VecIntTy = VectorType::get(IntTy, VF); Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy); - return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); + return Builder.CreateBitOrPointerCast(CastVal, DstVTy); } void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { From eb7d54a84bd6b8f85ceb94d96d09b50b494a3f9c Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Thu, 18 Jul 2024 12:58:10 -0700 Subject: [PATCH 073/486] [msan] Precommit MSan Arm NEON vst tests with origin-tracking (#99555) This adds an abridged copy of neon_vst.ll (from https://github.com/llvm/llvm-project/commit/ff0821583eab1651ff126bbf4f881e6163b67435), but with origin tracking enabled. The test will be updated when MSan's Arm NEON support is improved (e.g., https://github.com/llvm/llvm-project/pull/99360). --- .../AArch64/neon_vst_origins.ll | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll new file mode 100644 index 0000000000000..ff4c4c24bf37a --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_origins.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; Test memory sanitizer instrumentation for Arm NEON VST instructions, with +; origin tracking. These tests are deliberately shorter than neon_vst.ll, due +; to the verbosity of the output. 
+; +; RUN: opt < %s -passes=msan -msan-track-origins=2 -S | FileCheck %s +; +; Forked from llvm/test/CodeGen/AArch64/arm64-st1.ll + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android9001" + +; ----------------------------------------------------------------------------------------------------------------------------------------------- + +define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memory { +; +; CHECK-LABEL: define void @st2_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]] +; CHECK: 11: +; CHECK-NEXT: call void 
@__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> %A, <16 x i8> %B, ptr %P) + ret void +} + +define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind sanitize_memory { +; +; CHECK-LABEL: define void @st3_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i128 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF0]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP5]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF0]] +; CHECK: 16: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 17: +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]] +; CHECK: 18: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 19: +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) + ret void +} + +define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P) nounwind sanitize_memory { +; +; CHECK-LABEL: define void @st4_16b +; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, 
ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF0]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF0]] +; CHECK: 15: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP4]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP5]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label 
[[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]] +; CHECK: 18: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 19: +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP7]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF0]] +; CHECK: 21: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP8]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 22: +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP9]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP10]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]], <16 x i8> [[D]], ptr [[P]]) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %P) + ret void +} + +declare void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly +declare void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, ptr) nounwind sanitize_memory readonly From a0662176a9b40462aafbb17cd8eb8cf6a65e940e Mon Sep 17 00:00:00 2001 From: Iuri Chaer Date: Thu, 18 Jul 2024 21:11:24 +0100 Subject: [PATCH 074/486] [libc++] Speed up set_intersection() by fast-forwarding over ranges of non-matching elements with one-sided binary search. 
(#75230) One-sided binary search, aka meta binary search, has been in the public domain for decades, and has the general advantage of being constant time in the best case, with the downside of executing at most 2*log(N) comparisons vs classic binary search's exact log(N). There are two scenarios in which it really shines: the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing the container's size upfront, which adds N iterator increments to the complexity. The second one is when traversing the container in order, trying to fast-forward to the next value: in that case the classic algorithm requires at least O(N*log(N)) comparisons and, for non-random-access iterators, O(N^2) iterator increments, whereas the one-sided version will yield O(N) operations on both counts, with a best-case of O(log(N)) comparisons which is very common in practice. --- libcxx/benchmarks/CMakeLists.txt | 1 + .../algorithms/set_intersection.bench.cpp | 184 ++++++++ libcxx/docs/ReleaseNotes/19.rst | 4 + .../include/__algorithm/iterator_operations.h | 54 +++ libcxx/include/__algorithm/lower_bound.h | 54 ++- libcxx/include/__algorithm/set_intersection.h | 122 +++++- .../ranges_set_intersection.pass.cpp | 71 +-- .../set_intersection_complexity.pass.cpp | 404 ++++++++++++++++++ .../iterator_count.pass.cpp | 11 +- .../iterator_count_sentinel.pass.cpp | 63 +-- .../iterator_sentinel.pass.cpp | 28 +- .../range.adaptors/range.drop/begin.pass.cpp | 8 +- .../ranges/range.adaptors/range.drop/types.h | 10 +- .../range.adaptors/range.transform/types.h | 6 - libcxx/test/support/test_iterators.h | 184 ++++---- 15 files changed, 985 insertions(+), 219 deletions(-) create mode 100644 libcxx/benchmarks/algorithms/set_intersection.bench.cpp create mode 100644 libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/set_intersection_complexity.pass.cpp diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt index 
110672600213a..d96ccc1e49f66 100644 --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -135,6 +135,7 @@ set(BENCHMARK_TESTS algorithms/ranges_sort.bench.cpp algorithms/ranges_sort_heap.bench.cpp algorithms/ranges_stable_sort.bench.cpp + algorithms/set_intersection.bench.cpp algorithms/sort.bench.cpp algorithms/sort_heap.bench.cpp algorithms/stable_sort.bench.cpp diff --git a/libcxx/benchmarks/algorithms/set_intersection.bench.cpp b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp new file mode 100644 index 0000000000000..b3fb15fc77b31 --- /dev/null +++ b/libcxx/benchmarks/algorithms/set_intersection.bench.cpp @@ -0,0 +1,184 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "common.h" +#include "test_iterators.h" + +namespace { + +// types of containers we'll want to test, covering interesting iterator types +struct VectorContainer { + template + using type = std::vector; + + static constexpr const char* Name = "Vector"; +}; + +struct SetContainer { + template + using type = std::set; + + static constexpr const char* Name = "Set"; +}; + +using AllContainerTypes = std::tuple; + +// set_intersection performance may depend on where matching values lie +enum class OverlapPosition { + None, + Front, + // performance-wise, matches at the back are identical to ones at the front + Interlaced, +}; + +struct AllOverlapPositions : EnumValuesAsTuple { + static constexpr const char* Names[] = {"None", "Front", "Interlaced"}; +}; + +// forward_iterator wrapping which, for each increment, moves the underlying iterator forward Stride elements +template +struct 
StridedFwdIt { + Wrapped base_; + unsigned stride_; + + using iterator_category = std::forward_iterator_tag; + using difference_type = typename Wrapped::difference_type; + using value_type = typename Wrapped::value_type; + using pointer = typename Wrapped::pointer; + using reference = typename Wrapped::reference; + + StridedFwdIt(Wrapped base, unsigned stride) : base_(base), stride_(stride) { assert(stride_ != 0); } + + StridedFwdIt operator++() { + for (unsigned i = 0; i < stride_; ++i) + ++base_; + return *this; + } + StridedFwdIt operator++(int) { + auto tmp = *this; + ++*this; + return tmp; + } + value_type& operator*() { return *base_; } + const value_type& operator*() const { return *base_; } + value_type& operator->() { return *base_; } + const value_type& operator->() const { return *base_; } + bool operator==(const StridedFwdIt& o) const { return base_ == o.base_; } + bool operator!=(const StridedFwdIt& o) const { return !operator==(o); } +}; +template +StridedFwdIt(Wrapped, unsigned) -> StridedFwdIt; + +template +std::vector getVectorOfRandom(size_t N) { + std::vector v; + fillValues(v, N, Order::Random); + sortValues(v, Order::Random); + return std::vector(v); +} + +// Realistically, data won't all be nicely contiguous in a container, +// we'll go through some effort to ensure that it's shuffled through memory +// this is especially important for containers with non-contiguous element +// storage, but it will affect even a std::vector, because when you copy a +// std::vector the underlying data storage position for the char +// arrays of the copy are likely to have high locality +template +std::pair genCacheUnfriendlyData(size_t size1, size_t size2, OverlapPosition pos) { + using ValueType = typename Container::value_type; + auto move_into = [](auto first, auto last) { + Container out; + std::move(first, last, std::inserter(out, out.begin())); + return out; + }; + const auto src_size = pos == OverlapPosition::None ? 
size1 + size2 : std::max(size1, size2); + std::vector src = getVectorOfRandom(src_size); + + if (pos == OverlapPosition::None) { + std::sort(src.begin(), src.end()); + return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(src.begin() + size1, src.end())); + } + + // All other overlap types will have to copy some part of the data, but if + // we copy after sorting it will likely have high locality, so we sort + // each copy separately + auto copy = src; + std::sort(src.begin(), src.end()); + std::sort(copy.begin(), copy.end()); + + switch (pos) { + case OverlapPosition::None: + // we like -Wswitch :) + break; + + case OverlapPosition::Front: + return std::make_pair(move_into(src.begin(), src.begin() + size1), move_into(copy.begin(), copy.begin() + size2)); + + case OverlapPosition::Interlaced: + const auto stride1 = size1 < size2 ? size2 / size1 : 1; + const auto stride2 = size2 < size1 ? size1 / size2 : 1; + return std::make_pair(move_into(StridedFwdIt(src.begin(), stride1), StridedFwdIt(src.end(), stride1)), + move_into(StridedFwdIt(copy.begin(), stride2), StridedFwdIt(copy.end(), stride2))); + } + std::abort(); // would be std::unreachable() if it could + return std::pair(); +} + +template +struct SetIntersection { + using ContainerType = typename Container::template type>; + size_t size1_; + size_t size2_; + + SetIntersection(size_t size1, size_t size2) : size1_(size1), size2_(size2) {} + + bool skip() const noexcept { + // let's save some time and skip simmetrical runs + return size1_ < size2_; + } + + void run(benchmark::State& state) const { + auto input = genCacheUnfriendlyData(size1_, size2_, Overlap()); + std::vector> out(std::min(size1_, size2_)); + + const auto BATCH_SIZE = std::max(size_t{512}, (2 * TestSetElements) / (size1_ + size2_)); + for (const auto& _ : state) { + while (state.KeepRunningBatch(BATCH_SIZE)) { + for (unsigned i = 0; i < BATCH_SIZE; ++i) { + const auto& [c1, c2] = input; + auto res = 
std::set_intersection(c1.begin(), c1.end(), c2.begin(), c2.end(), out.begin()); + benchmark::DoNotOptimize(res); + } + } + } + } + + std::string name() const { + return std::string("SetIntersection") + Overlap::name() + '_' + Container::Name + ValueType::name() + '_' + + std::to_string(size1_) + '_' + std::to_string(size2_); + } +}; + +} // namespace + +int main(int argc, char** argv) { /**/ + benchmark::Initialize(&argc, argv); + if (benchmark::ReportUnrecognizedArguments(argc, argv)) + return 1; + + makeCartesianProductBenchmark( + Quantities, Quantities); + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 624550f998858..ccafa1a1456e0 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -71,6 +71,10 @@ Improvements and New Features - The ``std::ranges::minmax`` algorithm has been optimized for integral types, resulting in a performance increase of up to 100x. +- The ``std::set_intersection`` and ``std::ranges::set_intersection`` algorithms have been optimized to fast-forward over + contiguous ranges of non-matching values, reducing the number of comparisons from linear to + logarithmic growth with the number of elements in best-case scenarios. + - The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM`` macro has been added to make the declarations in ```` available. 
- The ``_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT`` macro has been added to make the declarations in ```` diff --git a/libcxx/include/__algorithm/iterator_operations.h b/libcxx/include/__algorithm/iterator_operations.h index 5cf13f0a3f292..8ced989233bc4 100644 --- a/libcxx/include/__algorithm/iterator_operations.h +++ b/libcxx/include/__algorithm/iterator_operations.h @@ -11,6 +11,7 @@ #include <__algorithm/iter_swap.h> #include <__algorithm/ranges_iterator_concept.h> +#include <__assert> #include <__config> #include <__iterator/advance.h> #include <__iterator/distance.h> @@ -160,6 +161,59 @@ struct _IterOps<_ClassicAlgPolicy> { _LIBCPP_HIDE_FROM_ABI static _LIBCPP_CONSTEXPR_SINCE_CXX14 void __advance_to(_Iter& __first, _Iter __last) { __first = __last; } + + // advance with sentinel, a la std::ranges::advance + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_Iter> + __advance_to(_Iter& __iter, __difference_type<_Iter> __count, const _Iter& __sentinel) { + return _IterOps::__advance_to(__iter, __count, __sentinel, typename iterator_traits<_Iter>::iterator_category()); + } + +private: + // advance with sentinel, a la std::ranges::advance -- InputIterator specialization + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_InputIter> __advance_to( + _InputIter& __iter, __difference_type<_InputIter> __count, const _InputIter& __sentinel, input_iterator_tag) { + __difference_type<_InputIter> __dist = 0; + for (; __dist < __count && __iter != __sentinel; ++__dist) + ++__iter; + return __count - __dist; + } + + // advance with sentinel, a la std::ranges::advance -- BidirectionalIterator specialization + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_BiDirIter> + __advance_to(_BiDirIter& __iter, + __difference_type<_BiDirIter> __count, + const _BiDirIter& __sentinel, + bidirectional_iterator_tag) { + __difference_type<_BiDirIter> __dist = 0; + if 
(__count >= 0) + for (; __dist < __count && __iter != __sentinel; ++__dist) + ++__iter; + else + for (__count = -__count; __dist < __count && __iter != __sentinel; ++__dist) + --__iter; + return __count - __dist; + } + + // advance with sentinel, a la std::ranges::advance -- RandomIterator specialization + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 static __difference_type<_RandIter> + __advance_to(_RandIter& __iter, + __difference_type<_RandIter> __count, + const _RandIter& __sentinel, + random_access_iterator_tag) { + auto __dist = _IterOps::distance(__iter, __sentinel); + _LIBCPP_ASSERT_VALID_INPUT_RANGE( + __count == 0 || (__dist < 0) == (__count < 0), "__sentinel must precede __iter when __count < 0"); + if (__count < 0) + __dist = __dist > __count ? __dist : __count; + else + __dist = __dist < __count ? __dist : __count; + __iter += __dist; + return __count - __dist; + } }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__algorithm/lower_bound.h b/libcxx/include/__algorithm/lower_bound.h index 8fd355a7cfc4a..c417d84835497 100644 --- a/libcxx/include/__algorithm/lower_bound.h +++ b/libcxx/include/__algorithm/lower_bound.h @@ -27,11 +27,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter -__lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) { - auto __len = _IterOps<_AlgPolicy>::distance(__first, __last); - +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter __lower_bound_bisecting( + _Iter __first, + const _Type& __value, + typename iterator_traits<_Iter>::difference_type __len, + _Comp& __comp, + _Proj& __proj) { while (__len != 0) { auto __l2 = std::__half_positive(__len); _Iter __m = __first; @@ -46,6 +48,48 @@ __lower_bound(_Iter __first, _Sent __last, const _Type& __value, _Comp& __comp, return __first; } +// One-sided binary search, aka meta binary search, has been in the public domain for decades, and 
has the general +// advantage of being \Omega(1) rather than the classic algorithm's \Omega(log(n)), with the downside of executing at +// most 2*log(n) comparisons vs the classic algorithm's exact log(n). There are two scenarios in which it really shines: +// the first one is when operating over non-random-access iterators, because the classic algorithm requires knowing the +// container's size upfront, which adds \Omega(n) iterator increments to the complexity. The second one is when you're +// traversing the container in order, trying to fast-forward to the next value: in that case, the classic algorithm +// would yield \Omega(n*log(n)) comparisons and, for non-random-access iterators, \Omega(n^2) iterator increments, +// whereas the one-sided version will yield O(n) operations on both counts, with a \Omega(log(n)) bound on the number of +// comparisons. +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +__lower_bound_onesided(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) { + // step = 0, ensuring we can always short-circuit when distance is 1 later on + if (__first == __last || !std::__invoke(__comp, std::__invoke(__proj, *__first), __value)) + return __first; + + using _Distance = typename iterator_traits<_ForwardIterator>::difference_type; + for (_Distance __step = 1; __first != __last; __step <<= 1) { + auto __it = __first; + auto __dist = __step - _IterOps<_AlgPolicy>::__advance_to(__it, __step, __last); + // once we reach the last range where needle can be we must start + // looking inwards, bisecting that range + if (__it == __last || !std::__invoke(__comp, std::__invoke(__proj, *__it), __value)) { + // we've already checked the previous value and it was less, we can save + // one comparison by skipping bisection + if (__dist == 1) + return __it; + return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj); + } + // range not found, 
move forward! + __first = __it; + } + return __first; +} + +template +_LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator +__lower_bound(_ForwardIterator __first, _Sent __last, const _Type& __value, _Comp& __comp, _Proj& __proj) { + const auto __dist = _IterOps<_AlgPolicy>::distance(__first, __last); + return std::__lower_bound_bisecting<_AlgPolicy>(__first, __value, __dist, __comp, __proj); +} + template _LIBCPP_NODISCARD inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, _Compare __comp) { diff --git a/libcxx/include/__algorithm/set_intersection.h b/libcxx/include/__algorithm/set_intersection.h index 73d888d1b0384..bb0d86cd0f58d 100644 --- a/libcxx/include/__algorithm/set_intersection.h +++ b/libcxx/include/__algorithm/set_intersection.h @@ -12,10 +12,15 @@ #include <__algorithm/comp.h> #include <__algorithm/comp_ref_type.h> #include <__algorithm/iterator_operations.h> +#include <__algorithm/lower_bound.h> #include <__config> +#include <__functional/identity.h> #include <__iterator/iterator_traits.h> #include <__iterator/next.h> +#include <__type_traits/is_same.h> +#include <__utility/exchange.h> #include <__utility/move.h> +#include <__utility/swap.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -38,10 +43,103 @@ struct __set_intersection_result { : __in1_(std::move(__in_iter1)), __in2_(std::move(__in_iter2)), __out_(std::move(__out_iter)) {} }; -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter> +// Helper for __set_intersection() with one-sided binary search: populate result and advance input iterators if they +// are found to potentially contain the same value in two consecutive calls. 
This function is very intimately related to +// the way it is used and doesn't attempt to abstract that, it's not appropriate for general usage outside of its +// context. +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __set_intersection_add_output_if_equal( + bool __may_be_equal, + _InForwardIter1& __first1, + _InForwardIter2& __first2, + _OutIter& __result, + bool& __prev_may_be_equal) { + if (__may_be_equal && __prev_may_be_equal) { + *__result = *__first1; + ++__result; + ++__first1; + ++__first2; + __prev_may_be_equal = false; + } else { + __prev_may_be_equal = __may_be_equal; + } +} + +// With forward iterators we can make multiple passes over the data, allowing the use of one-sided binary search to +// reduce best-case complexity to log(N). Understanding how we can use binary search and still respect complexity +// guarantees is _not_ straightforward: the guarantee is "at most 2*(N+M)-1 comparisons", and one-sided binary search +// will necessarily overshoot depending on the position of the needle in the haystack -- for instance, if we're +// searching for 3 in (1, 2, 3, 4), we'll check if 3<1, then 3<2, then 3<4, and, finally, 3<3, for a total of 4 +// comparisons, when linear search would have yielded 3. However, because we won't need to perform the intervening +// reciprocal comparisons (ie 1<3, 2<3, 4<3), that extra comparison doesn't run afoul of the guarantee. Additionally, +// this type of scenario can only happen for match distances of up to 5 elements, because 2*log2(8) is 6, and we'll +// still be worse-off at position 5 of an 8-element set. From then onwards these scenarios can't happen. TL;DR: we'll be +// 1 comparison worse-off compared to the classic linear-searching algorithm if matching position 3 of a set with 4 +// elements, or position 5 if the set has 7 or 8 elements, but we'll never exceed the complexity guarantees from the +// standard. 
+template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI +_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter> __set_intersection( - _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) { + _InForwardIter1 __first1, + _Sent1 __last1, + _InForwardIter2 __first2, + _Sent2 __last2, + _OutIter __result, + _Compare&& __comp, + std::forward_iterator_tag, + std::forward_iterator_tag) { + _LIBCPP_CONSTEXPR std::__identity __proj; + bool __prev_may_be_equal = false; + + while (__first2 != __last2) { + _InForwardIter1 __first1_next = + std::__lower_bound_onesided<_AlgPolicy>(__first1, __last1, *__first2, __comp, __proj); + std::swap(__first1_next, __first1); + // keeping in mind that a==b iff !(a(__first2, __last2, *__first1, __comp, __proj); + std::swap(__first2_next, __first2); + std::__set_intersection_add_output_if_equal( + __first2 == __first2_next, __first1, __first2, __result, __prev_may_be_equal); + } + return __set_intersection_result<_InForwardIter1, _InForwardIter2, _OutIter>( + _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)), + _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)), + std::move(__result)); +} + +// input iterators are not suitable for multipass algorithms, so we stick to the classic single-pass version +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI +_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InInputIter1, _InInputIter2, _OutIter> +__set_intersection( + _InInputIter1 __first1, + _Sent1 __last1, + _InInputIter2 __first2, + _Sent2 __last2, + _OutIter __result, + _Compare&& __comp, + std::input_iterator_tag, + std::input_iterator_tag) { while (__first1 != __last1 && __first2 != __last2) { if (__comp(*__first1, *__first2)) ++__first1; @@ -55,12 +153,28 @@ __set_intersection( } } - return __set_intersection_result<_InIter1, _InIter2, _OutIter>( + return __set_intersection_result<_InInputIter1, _InInputIter2, 
_OutIter>( _IterOps<_AlgPolicy>::next(std::move(__first1), std::move(__last1)), _IterOps<_AlgPolicy>::next(std::move(__first2), std::move(__last2)), std::move(__result)); } +template +_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI +_LIBCPP_CONSTEXPR_SINCE_CXX20 __set_intersection_result<_InIter1, _InIter2, _OutIter> +__set_intersection( + _InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Sent2 __last2, _OutIter __result, _Compare&& __comp) { + return std::__set_intersection<_AlgPolicy>( + std::move(__first1), + std::move(__last1), + std::move(__first2), + std::move(__last2), + std::move(__result), + std::forward<_Compare>(__comp), + typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter1>(), + typename std::_IterOps<_AlgPolicy>::template __iterator_category<_InIter2>()); +} + template inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator set_intersection( _InputIterator1 __first1, diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp index 5323bb1bc1193..f7870485cfefc 100644 --- a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp +++ b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/ranges_set_intersection.pass.cpp @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include "almost_satisfies_types.h" #include "MoveOnly.h" @@ -463,75 +465,6 @@ constexpr bool test() { } } - // Complexity: At most 2 * ((last1 - first1) + (last2 - first2)) - 1 comparisons and applications of each projection. 
- { - std::array r1{{{1}, {3}, {5}, {7}, {9}}}; - std::array r2{{{2}, {4}, {6}, {8}, {10}}}; - std::array expected{}; - - const std::size_t maxOperation = 2 * (r1.size() + r2.size()) - 1; - - // iterator overload - { - std::array out{}; - std::size_t numberOfComp = 0; - std::size_t numberOfProj1 = 0; - std::size_t numberOfProj2 = 0; - - const auto comp = [&numberOfComp](int x, int y) { - ++numberOfComp; - return x < y; - }; - - const auto proj1 = [&numberOfProj1](const Data& d) { - ++numberOfProj1; - return d.data; - }; - - const auto proj2 = [&numberOfProj2](const Data& d) { - ++numberOfProj2; - return d.data; - }; - - std::ranges::set_intersection(r1.begin(), r1.end(), r2.begin(), r2.end(), out.data(), comp, proj1, proj2); - - assert(std::ranges::equal(out, expected, {}, &Data::data)); - assert(numberOfComp < maxOperation); - assert(numberOfProj1 < maxOperation); - assert(numberOfProj2 < maxOperation); - } - - // range overload - { - std::array out{}; - std::size_t numberOfComp = 0; - std::size_t numberOfProj1 = 0; - std::size_t numberOfProj2 = 0; - - const auto comp = [&numberOfComp](int x, int y) { - ++numberOfComp; - return x < y; - }; - - const auto proj1 = [&numberOfProj1](const Data& d) { - ++numberOfProj1; - return d.data; - }; - - const auto proj2 = [&numberOfProj2](const Data& d) { - ++numberOfProj2; - return d.data; - }; - - std::ranges::set_intersection(r1, r2, out.data(), comp, proj1, proj2); - - assert(std::ranges::equal(out, expected, {}, &Data::data)); - assert(numberOfComp < maxOperation); - assert(numberOfProj1 < maxOperation); - assert(numberOfProj2 < maxOperation); - } - } - // Comparator convertible to bool { struct ConvertibleToBool { diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/set_intersection_complexity.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/set_intersection_complexity.pass.cpp new file mode 100644 index 0000000000000..ddf4087ddd6cd --- /dev/null +++ 
b/libcxx/test/std/algorithms/alg.sorting/alg.set.operations/set.intersection/set_intersection_complexity.pass.cpp @@ -0,0 +1,404 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// Algorithmic complexity tests for both std::set_intersection and std::ranges::set_intersection + +// template +// requires OutputIterator +// && OutputIterator +// && HasLess +// && HasLess +// constexpr OutIter // constexpr after C++17 +// set_intersection(InIter1 first1, InIter1 last1, InIter2 first2, InIter2 last2, +// OutIter result); +// +// template S1, input_iterator I2, sentinel_for S2, +// weakly_incrementable O, class Comp = ranges::less, +// class Proj1 = identity, class Proj2 = identity> +// requires mergeable +// constexpr set_intersection_result +// set_intersection(I1 first1, S1 last1, I2 first2, S2 last2, O result, +// Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {}); // since C++20 +// +// template +// requires mergeable, iterator_t, O, Comp, Proj1, Proj2> +// constexpr set_intersection_result, borrowed_iterator_t, O> +// set_intersection(R1&& r1, R2&& r2, O result, +// Comp comp = {}, Proj1 proj1 = {}, Proj2 proj2 = {}); // since C++20 + +#include +#include +#include +#include + +#include "test_iterators.h" + +namespace { + +// __debug_less will perform an additional comparison in an assertion +static constexpr unsigned std_less_comparison_count_multiplier() noexcept { +#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG + return 2; +#else + return 1; +#endif +} + +struct [[nodiscard]] OperationCounts { + std::size_t comparisons{}; + struct PerInput { + std::size_t proj{}; + 
IteratorOpCounts iterops; + + [[nodiscard]] constexpr bool isNotBetterThan(const PerInput& other) { + return proj >= other.proj && iterops.increments + iterops.decrements + iterops.zero_moves >= + other.iterops.increments + other.iterops.decrements + other.iterops.zero_moves; + } + }; + std::array in; + + [[nodiscard]] constexpr bool isNotBetterThan(const OperationCounts& expect) { + return std_less_comparison_count_multiplier() * comparisons >= expect.comparisons && + in[0].isNotBetterThan(expect.in[0]) && in[1].isNotBetterThan(expect.in[1]); + } +}; + +template +struct counted_set_intersection_result { + std::array result; + OperationCounts opcounts; + + constexpr counted_set_intersection_result() = default; + + constexpr explicit counted_set_intersection_result(std::array&& contents) : result{contents} {} + + constexpr void assertNotBetterThan(const counted_set_intersection_result& other) { + assert(result == other.result); + assert(opcounts.isNotBetterThan(other.opcounts)); + } +}; + +template +counted_set_intersection_result(std::array) -> counted_set_intersection_result; + +template