1
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
1
2
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
2
3
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
3
4
@@ -10,67 +11,95 @@ declare i64 @llvm.ctlz.i64(i64, i1) readnone
10
11
; There should be no difference between llvm.ctlz.i32(%a, true) and
11
12
; llvm.ctlz.i32(%a, false), as ptx's clz(0) is well defined: it returns 32.
12
13
13
- ; CHECK-LABEL: myctlz(
14
14
define i32 @myctlz(i32 %a) {
; CHECK-LABEL: myctlz(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
  ; 32-bit count-leading-zeros with the i1 flag set to false (a zero input
  ; must yield a defined result). The expected PTX is a single clz.b32
  ; between the parameter load and the return-value store.
  %lz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
  ret i32 %lz
}
22
- ; CHECK-LABEL: myctlz_2(
23
27
define i32 @myctlz_2(i32 %a) {
; CHECK-LABEL: myctlz_2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
  ; Same 32-bit operation as @myctlz, but with the i1 flag set to true.
  ; The expected PTX is still one clz.b32 and nothing else.
  %lz = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
  ret i32 %lz
}
31
40
32
41
; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
33
42
; value, so here we have to zero-extend it.
34
- ; CHECK-LABEL: myctlz64(
35
43
define i64 @myctlz64(i64 %a) {
; CHECK-LABEL: myctlz64(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
; CHECK-NEXT: clz.b64 %r1, %rd1;
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
; CHECK-NEXT: ret;
  ; 64-bit count-leading-zeros, i1 flag false. clz.b64 writes a 32-bit
  ; register, so one cvt.u64.u32 is expected before the 64-bit result is
  ; stored as the return value.
  %lz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  ret i64 %lz
}
44
- ; CHECK-LABEL: myctlz64_2(
45
58
define i64 @myctlz64_2(i64 %a) {
; CHECK-LABEL: myctlz64_2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
; CHECK-NEXT: clz.b64 %r1, %rd1;
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
; CHECK-NEXT: ret;
  ; Same 64-bit operation as @myctlz64, but with the i1 flag set to true.
  ; The expected PTX is unchanged: clz.b64 followed by cvt.u64.u32.
  %lz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  ret i64 %lz
}
54
73
55
74
; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
56
75
; natural return width of ptx's clz.b64 instruction. No conversions should be
57
76
; necessary in the PTX.
58
- ; CHECK-LABEL: myctlz64_as_32(
59
77
define i32 @myctlz64_as_32(i64 %a) {
; CHECK-LABEL: myctlz64_as_32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
; CHECK-NEXT: clz.b64 %r1, %rd1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
  ; The i64 intrinsic result is narrowed to i32 in IR. In the expected PTX
  ; the 32-bit register written by clz.b64 is stored directly, with no cvt.
  %wide = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  %narrow = trunc i64 %wide to i32
  ret i32 %narrow
}
68
- ; CHECK-LABEL: myctlz64_as_32_2(
69
92
define i32 @myctlz64_as_32_2 (i64 %a ) {
70
- ; CHECK: ld.param.
71
- ; CHECK-NEXT: clz.b64
72
- ; CHECK-NEXT: st.param.
73
- ; CHECK-NEXT: ret;
93
+ ; CHECK-LABEL: myctlz64_as_32_2(
94
+ ; CHECK: {
95
+ ; CHECK-NEXT: .reg .b32 %r<2>;
96
+ ; CHECK-NEXT: .reg .b64 %rd<2>;
97
+ ; CHECK-EMPTY:
98
+ ; CHECK-NEXT: // %bb.0:
99
+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
100
+ ; CHECK-NEXT: clz.b64 %r1, %rd1;
101
+ ; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
102
+ ; CHECK-NEXT: ret;
74
103
%val = call i64 @llvm.ctlz.i64 (i64 %a , i1 false ) readnone
75
104
%trunc = trunc i64 %val to i32
76
105
ret i32 %trunc
@@ -80,53 +109,67 @@ define i32 @myctlz64_as_32_2(i64 %a) {
80
109
; and then truncating the result back down to i16. But the NVPTX ABI
81
110
; zero-extends i16 return values to i32, so the final truncation doesn't appear
82
111
; in this function.
83
- ; CHECK-LABEL: myctlz_ret16(
84
112
define i16 @myctlz_ret16(i16 %a) {
; CHECK-LABEL: myctlz_ret16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
; CHECK-NEXT: add.s32 %r3, %r2, -16;
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
  ; 16-bit count-leading-zeros, i1 flag false. The expected PTX loads the
  ; argument into a 32-bit register, runs clz.b32 on it, and adds -16 to
  ; the count before storing the 32-bit return value.
  %lz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  ret i16 %lz
}
94
- ; CHECK-LABEL: myctlz_ret16_2(
95
126
define i16 @myctlz_ret16_2(i16 %a) {
; CHECK-LABEL: myctlz_ret16_2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
; CHECK-NEXT: shl.b32 %r2, %r1, 16;
; CHECK-NEXT: clz.b32 %r3, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
  ; 16-bit count-leading-zeros with the i1 flag set to true. The expected
  ; PTX shifts the argument left by 16 and applies clz.b32 to the shifted
  ; value; no add of -16 follows.
  %lz = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  ret i16 %lz
}
105
140
106
141
; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
107
142
; be folded into the 16-bit store (st.u16) rather than appear as a separate cvt.
108
- ; CHECK-LABEL: myctlz_store16(
109
143
define void @myctlz_store16(i16 %a, ptr %b) {
; CHECK-LABEL: myctlz_store16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
; CHECK-NEXT: clz.b32 %r2, %r1;
; CHECK-NEXT: add.s32 %r3, %r2, -16;
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
; CHECK-NEXT: st.u16 [%rd1], %r3;
; CHECK-NEXT: ret;
  ; 16-bit count-leading-zeros (i1 flag false) whose result is written
  ; through %b. The expected PTX computes clz.b32 plus an add of -16 in
  ; 32-bit registers and writes the result with a 16-bit st.u16.
  %lz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  store i16 %lz, ptr %b
  ret void
}
121
- ; CHECK-LABEL: myctlz_store16_2(
122
160
define void @myctlz_store16_2 (i16 %a , ptr %b ) {
123
- ; CHECK: ld.param.
124
- ; CHECK-NEXT: cvt.u32.u16
125
- ; CHECK-NEXT: clz.b32
126
- ; CHECK-DAG: cvt.u16.u32
127
- ; CHECK-DAG: sub.
128
- ; CHECK: st.{{[a-z]}}16
129
- ; CHECK: ret;
161
+ ; CHECK-LABEL: myctlz_store16_2(
162
+ ; CHECK: {
163
+ ; CHECK-NEXT: .reg .b32 %r<4>;
164
+ ; CHECK-NEXT: .reg .b64 %rd<2>;
165
+ ; CHECK-EMPTY:
166
+ ; CHECK-NEXT: // %bb.0:
167
+ ; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
168
+ ; CHECK-NEXT: clz.b32 %r2, %r1;
169
+ ; CHECK-NEXT: add.s32 %r3, %r2, -16;
170
+ ; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
171
+ ; CHECK-NEXT: st.u16 [%rd1], %r3;
172
+ ; CHECK-NEXT: ret;
130
173
%val = call i16 @llvm.ctlz.i16 (i16 %a , i1 false ) readnone
131
174
store i16 %val , ptr %b
132
175
ret void
0 commit comments