Commit bfb9b8e

[Passes] add a tail-call-elim pass near the end of the opt pipeline
We call tail-call-elim near the beginning of the pipeline, but that is too early to annotate calls that get added later. In the motivating case from issue #47852, the missing 'tail' on memset leads to sub-optimal codegen.

I experimented with removing the early instance of tail-call-elim instead of just adding another pass, but that appears to be slightly worse for compile-time: +0.15% vs. +0.08% time. "tailcall" shows adding the pass; "tailcall2" shows moving the pass to later, then adding the original early pass back (so 1596886802 is functionally equivalent to 180b0439dc):
https://llvm-compile-time-tracker.com/index.php?config=NewPM-O3&stat=instructions&remote=rotateright

Note that there was an effort to split the tail call functionality into 2 passes; that could help reduce compile-time if we find that this change costs more in compile-time than expected based on the preliminary testing: D60031

Differential Revision: https://reviews.llvm.org/D130374
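For context on the motivating case, the pattern in issue #47852 is a function whose body ends in a call to memset: without a late tail-call-elim run, that call is left as a plain `call` in IR and the backend emits a call followed by a return instead of a jump. A minimal sketch of that shape (the function name and argument types here are illustrative, not taken from the issue):

#include <string.h>

/* A function that does nothing after memset returns. If the IR call is
 * marked 'tail' (asserting the callee does not access this frame's
 * allocas), the backend may lower it as a direct jump to memset rather
 * than a call followed by ret. */
void zero_buffer(char *p, unsigned long n) {
  memset(p, 0, n); /* candidate for the 'tail' marker */
}

Note that 'tail' only asserts that the call does not access the caller's stack; whether it becomes an actual tail jump is still a backend decision. The clang test updates below show the same annotation appearing on intrinsic and inline-asm calls now that the pass also runs late in the pipeline.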
1 parent 95f4ca7 commit bfb9b8e

455 files changed: +20939 additions, -20952 deletions

clang/test/CodeGen/aarch64-ls64-inline-asm.c

Lines changed: 3 additions & 3 deletions
@@ -5,7 +5,7 @@ struct foo { unsigned long long x[8]; };

 // CHECK-LABEL: @load(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.foo* [[OUTPUT:%.*]] to i512*
 // CHECK-NEXT: store i512 [[TMP0]], i512* [[TMP1]], align 8
 // CHECK-NEXT: ret void
@@ -19,7 +19,7 @@ void load(struct foo *output, void *addr)
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.foo* [[INPUT:%.*]] to i512*
 // CHECK-NEXT: [[TMP1:%.*]] = load i512, i512* [[TMP0]], align 8
-// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
+// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
 // CHECK-NEXT: ret void
 //
 void store(const struct foo *input, void *addr)
@@ -74,7 +74,7 @@ void store(const struct foo *input, void *addr)
 // CHECK-NEXT: [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512
 // CHECK-NEXT: [[S_SROA_0_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]]
 // CHECK-NEXT: [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], [[S_SROA_0_0_INSERT_EXT]]
-// CHECK-NEXT: call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
+// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
 // CHECK-NEXT: ret void
 //
 void store2(int *in, void *addr)

clang/test/CodeGen/aarch64-neon-vcmla.c

Lines changed: 52 additions & 52 deletions
(large diff not rendered)

clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c

Lines changed: 10 additions & 10 deletions
@@ -52,20 +52,20 @@ vec2048 x2048 = {0, 1, 2, 3, 3 , 2 , 1, 0, 0, 1, 2, 3, 3 , 2 , 1, 0,
 typedef int8_t vec_int8 __attribute__((vector_size(N / 8)));
 // CHECK128-LABEL: define{{.*}} <16 x i8> @f2(<16 x i8> noundef %x)
 // CHECK128-NEXT: entry:
-// CHECK128-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-// CHECK128-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> [[X:%.*]], i64 0)
-// CHECK128-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP0]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
-// CHECK128-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP1]], i64 0)
+// CHECK128-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+// CHECK128-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> [[X:%.*]], i64 0)
+// CHECK128-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP0]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
+// CHECK128-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[TMP1]], i64 0)
 // CHECK128-NEXT: ret <16 x i8> [[CASTFIXEDSVE]]

 // CHECK-LABEL: define{{.*}} void @f2(
 // CHECK-SAME: <[[#div(VBITS,8)]] x i8>* noalias nocapture writeonly sret(<[[#div(VBITS,8)]] x i8>) align 16 %agg.result, <[[#div(VBITS,8)]] x i8>* nocapture noundef readonly %0)
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, <[[#div(VBITS,8)]] x i8>* [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]]
-// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8(<vscale x 16 x i8> undef, <[[#div(VBITS,8)]] x i8> [[X]], i64 0)
-// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
-// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 0)
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8(<vscale x 16 x i8> undef, <[[#div(VBITS,8)]] x i8> [[X]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.asrd.nxv16i8(<vscale x 16 x i1> [[TMP1]], <vscale x 16 x i8> [[CASTSCALABLESVE]], i32 1)
+// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[TMP2]], i64 0)
 // CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[CASTFIXEDSVE]], <[[#div(VBITS,8)]] x i8>* [[AGG_RESULT:%.*]], align 16, [[TBAA6]]
 // CHECK-NEXT: ret void
 vec_int8 f2(vec_int8 x) { return svasrd_x(svptrue_b8(), x, 1); }
@@ -80,14 +80,14 @@ typedef svint8_t vec2 __attribute__((arm_sve_vector_bits(N)));

 // CHECK128-LABEL: define{{.*}} void @g(<vscale x 16 x i8> noundef %x.coerce)
 // CHECK128-NEXT: entry:
-// CHECK128-NEXT: [[X:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
+// CHECK128-NEXT: [[X:%.*]] = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
 // CHECK128-NEXT: call void @f3(<16 x i8> noundef [[X]]) [[ATTR5:#.*]]
 // CHECK128-NEXT: ret void

 // CHECK-LABEL: define{{.*}} void @g(<vscale x 16 x i8> noundef %x.coerce)
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS,8)]] x i8>, align 16
-// CHECK-NEXT: [[X:%.*]] = call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
+// CHECK-NEXT: [[X:%.*]] = tail call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8(<vscale x 16 x i8> [[X_COERCE:%.*]], i64 0)
 // CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[X]], <[[#div(VBITS,8)]] x i8>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6]]
 // CHECK-NEXT: call void @f3(<[[#div(VBITS,8)]] x i8>* noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]]
 // CHECK-NEXT: ret void

clang/test/CodeGen/aarch64-sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp

Lines changed: 5 additions & 5 deletions
@@ -49,10 +49,10 @@ void test02() {
 // CHECK-SAME: [[#VBITS]]
 // CHECK-SAME: EES_(<vscale x 4 x i32> noundef %x.coerce, <vscale x 4 x i32> noundef %y.coerce)
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[X:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
-// CHECK-NEXT: [[Y:%.*]] = call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE1:%.*]], i64 0)
+// CHECK-NEXT: [[X:%.*]] = tail call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE:%.*]], i64 0)
+// CHECK-NEXT: [[Y:%.*]] = tail call <[[#div(VBITS, 32)]] x i32> @llvm.vector.extract.v[[#div(VBITS, 32)]]i32.nxv4i32(<vscale x 4 x i32> [[X_COERCE1:%.*]], i64 0)
 // CHECK-NEXT: [[ADD:%.*]] = add <[[#div(VBITS, 32)]] x i32> [[Y]], [[X]]
-// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v[[#div(VBITS, 32)]]i32(<vscale x 4 x i32> undef, <[[#div(VBITS, 32)]] x i32> [[ADD]], i64 0)
+// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v[[#div(VBITS, 32)]]i32(<vscale x 4 x i32> undef, <[[#div(VBITS, 32)]] x i32> [[ADD]], i64 0)
 // CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
 typedef svint32_t vec __attribute__((arm_sve_vector_bits(N)));
 auto f(vec x, vec y) { return x + y; } // Returns a vec.
@@ -68,11 +68,11 @@ typedef svint16_t vec2 __attribute__((arm_sve_vector_bits(N)));
 // CHECK-SAME: [[#VBITS]]
 // CHECK-SAME: EE(<vscale x 8 x i16> noundef %x.coerce)
 // CHECK-NEXT: entry:
-// CHECK128-NEXT: [[X:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
+// CHECK128-NEXT: [[X:%.*]] = tail call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
 // CHECK128-NEXT: call void @_Z1fDv8_s(<8 x i16> noundef [[X]]) [[ATTR5:#.*]]
 // CHECK128-NEXT: ret void
 // CHECKWIDE-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca <[[#div(VBITS, 16)]] x i16>, align 16
-// CHECKWIDE-NEXT: [[X:%.*]] = call <[[#div(VBITS, 16)]] x i16> @llvm.vector.extract.v[[#div(VBITS, 16)]]i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
+// CHECKWIDE-NEXT: [[X:%.*]] = tail call <[[#div(VBITS, 16)]] x i16> @llvm.vector.extract.v[[#div(VBITS, 16)]]i16.nxv8i16(<vscale x 8 x i16> [[X_COERCE:%.*]], i64 0)
 // CHECKWIDE-NEXT: store <[[#div(VBITS, 16)]] x i16> [[X]], <[[#div(VBITS, 16)]] x i16>* [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6:!tbaa !.*]]
 // CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS, 16)]]_s(<[[#div(VBITS, 16)]] x i16>* noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]]
 // CHECKWIDE-NEXT: ret void
