|
17 | 17 | # define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
|
18 | 18 | #endif
|
19 | 19 |
|
| 20 | +#pragma push_macro("SM_53") |
20 | 21 | #pragma push_macro("SM_70")
|
21 | 22 | #pragma push_macro("SM_72")
|
22 | 23 | #pragma push_macro("SM_75")
|
|
30 | 31 |
|
31 | 32 | #pragma push_macro("SM_60")
|
32 | 33 | #define SM_60 "sm_60|sm_61|sm_62|" SM_70
|
| 34 | +#define SM_53 "sm_53|" SM_60 |
33 | 35 |
|
| 36 | +#pragma push_macro("PTX42") |
34 | 37 | #pragma push_macro("PTX60")
|
35 | 38 | #pragma push_macro("PTX61")
|
36 | 39 | #pragma push_macro("PTX63")
|
|
53 | 56 | #define PTX63 "ptx63|" PTX64
|
54 | 57 | #define PTX61 "ptx61|" PTX63
|
55 | 58 | #define PTX60 "ptx60|" PTX61
|
| 59 | +#define PTX42 "ptx42|" PTX60 |
56 | 60 |
|
57 | 61 | #pragma push_macro("AND")
|
58 | 62 | #define AND(a, b) "(" a "),(" b ")"
|
@@ -110,13 +114,89 @@ BUILTIN(__nvvm_prmt, "UiUiUiUi", "")
|
110 | 114 |
|
111 | 115 | // Min Max
|
112 | 116 |
|
113 |
| -BUILTIN(__nvvm_fmax_ftz_f, "fff", "") |
114 |
| -BUILTIN(__nvvm_fmax_f, "fff", "") |
115 |
| -BUILTIN(__nvvm_fmin_ftz_f, "fff", "") |
116 |
| -BUILTIN(__nvvm_fmin_f, "fff", "") |
| 117 | +TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80, PTX70)) |
| 118 | +TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80, PTX70)) |
| 119 | +TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80, PTX70)) |
| 120 | +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) |
| 121 | +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) |
| 122 | +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) |
| 123 | +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) |
| 124 | +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16, "hhh", "", |
| 125 | + AND(SM_86, PTX72)) |
| 126 | +TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 127 | +TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 128 | +TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 129 | +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 130 | +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 131 | + AND(SM_86, PTX72)) |
| 132 | +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 133 | + AND(SM_86, PTX72)) |
| 134 | +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 135 | + AND(SM_86, PTX72)) |
| 136 | +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 137 | + AND(SM_86, PTX72)) |
| 138 | +TARGET_BUILTIN(__nvvm_fmin_bf16, "UsUsUs", "", AND(SM_80, PTX70)) |
| 139 | +TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70)) |
| 140 | +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72)) |
| 141 | +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16, "UsUsUs", "", |
| 142 | + AND(SM_86, PTX72)) |
| 143 | +TARGET_BUILTIN(__nvvm_fmin_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) |
| 144 | +TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) |
| 145 | +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16x2, "ZUiZUiZUi", "", |
| 146 | + AND(SM_86, PTX72)) |
| 147 | +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "", |
| 148 | + AND(SM_86, PTX72)) |
| 149 | +BUILTIN(__nvvm_fmin_f, "fff", "") |
| 150 | +BUILTIN(__nvvm_fmin_ftz_f, "fff", "") |
| 151 | +TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70)) |
| 152 | +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) |
| 153 | +TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
| 154 | +TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
| 155 | +TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
| 156 | +TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
| 157 | +BUILTIN(__nvvm_fmin_d, "ddd", "") |
117 | 158 |
|
| 159 | +TARGET_BUILTIN(__nvvm_fmax_f16, "hhh", "", AND(SM_80, PTX70)) |
| 160 | +TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80, PTX70)) |
| 161 | +TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80, PTX70)) |
| 162 | +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70)) |
| 163 | +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) |
| 164 | +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) |
| 165 | +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72)) |
| 166 | +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16, "hhh", "", |
| 167 | + AND(SM_86, PTX72)) |
| 168 | +TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 169 | +TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 170 | +TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 171 | +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70)) |
| 172 | +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 173 | + AND(SM_86, PTX72)) |
| 174 | +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 175 | + AND(SM_86, PTX72)) |
| 176 | +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 177 | + AND(SM_86, PTX72)) |
| 178 | +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "", |
| 179 | + AND(SM_86, PTX72)) |
| 180 | +TARGET_BUILTIN(__nvvm_fmax_bf16, "UsUsUs", "", AND(SM_80, PTX70)) |
| 181 | +TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70)) |
| 182 | +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72)) |
| 183 | +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16, "UsUsUs", "", |
| 184 | + AND(SM_86, PTX72)) |
| 185 | +TARGET_BUILTIN(__nvvm_fmax_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) |
| 186 | +TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70)) |
| 187 | +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16x2, "ZUiZUiZUi", "", |
| 188 | + AND(SM_86, PTX72)) |
| 189 | +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "", |
| 190 | + AND(SM_86, PTX72)) |
| 191 | +BUILTIN(__nvvm_fmax_f, "fff", "") |
| 192 | +BUILTIN(__nvvm_fmax_ftz_f, "fff", "") |
| 193 | +TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80, PTX70)) |
| 194 | +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80, PTX70)) |
| 195 | +TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
| 196 | +TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
| 197 | +TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
| 198 | +TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72)) |
118 | 199 | BUILTIN(__nvvm_fmax_d, "ddd", "")
|
119 |
| -BUILTIN(__nvvm_fmin_d, "ddd", "") |
120 | 200 |
|
121 | 201 | // Multiplication
|
122 | 202 |
|
@@ -228,6 +308,22 @@ TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
|
228 | 308 |
|
229 | 309 | // Fma
|
230 | 310 |
|
| 311 | +TARGET_BUILTIN(__nvvm_fma_rn_f16, "hhhh", "", AND(SM_53, PTX42)) |
| 312 | +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16, "hhhh", "", AND(SM_53, PTX42)) |
| 313 | +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16, "hhhh", "", AND(SM_53, PTX42)) |
| 314 | +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16, "hhhh", "", AND(SM_53, PTX42)) |
| 315 | +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16, "hhhh", "", AND(SM_80, PTX70)) |
| 316 | +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16, "hhhh", "", AND(SM_80, PTX70)) |
| 317 | +TARGET_BUILTIN(__nvvm_fma_rn_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) |
| 318 | +TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) |
| 319 | +TARGET_BUILTIN(__nvvm_fma_rn_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) |
| 320 | +TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42)) |
| 321 | +TARGET_BUILTIN(__nvvm_fma_rn_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) |
| 322 | +TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70)) |
| 323 | +TARGET_BUILTIN(__nvvm_fma_rn_bf16, "UsUsUsUs", "", AND(SM_80, PTX70)) |
| 324 | +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16, "UsUsUsUs", "", AND(SM_80, PTX70)) |
| 325 | +TARGET_BUILTIN(__nvvm_fma_rn_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70)) |
| 326 | +TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70)) |
231 | 327 | BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "")
|
232 | 328 | BUILTIN(__nvvm_fma_rn_f, "ffff", "")
|
233 | 329 | BUILTIN(__nvvm_fma_rz_ftz_f, "ffff", "")
|
@@ -2309,15 +2405,24 @@ TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
|
2309 | 2405 | TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
|
2310 | 2406 | TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
|
2311 | 2407 |
|
| 2408 | + |
| 2409 | +// bf16, bf16x2 abs, neg |
| 2410 | +TARGET_BUILTIN(__nvvm_abs_bf16, "UsUs", "", AND(SM_80,PTX70)) |
| 2411 | +TARGET_BUILTIN(__nvvm_abs_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) |
| 2412 | +TARGET_BUILTIN(__nvvm_neg_bf16, "UsUs", "", AND(SM_80,PTX70)) |
| 2413 | +TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70)) |
| 2414 | + |
2312 | 2415 | #undef BUILTIN
|
2313 | 2416 | #undef TARGET_BUILTIN
|
2314 | 2417 | #pragma pop_macro("AND")
|
| 2418 | +#pragma pop_macro("SM_53") |
2315 | 2419 | #pragma pop_macro("SM_60")
|
2316 | 2420 | #pragma pop_macro("SM_70")
|
2317 | 2421 | #pragma pop_macro("SM_72")
|
2318 | 2422 | #pragma pop_macro("SM_75")
|
2319 | 2423 | #pragma pop_macro("SM_80")
|
2320 | 2424 | #pragma pop_macro("SM_86")
|
| 2425 | +#pragma pop_macro("PTX42") |
2321 | 2426 | #pragma pop_macro("PTX60")
|
2322 | 2427 | #pragma pop_macro("PTX61")
|
2323 | 2428 | #pragma pop_macro("PTX63")
|
|
0 commit comments