Skip to content

Commit d586a3e

Browse files
committed
cleanup rebase disaster for VPCLMUL
1 parent 4bbb61a commit d586a3e

File tree

1 file changed

+131
-46
lines changed

1 file changed

+131
-46
lines changed

crates/core_arch/src/x86/avx512vpclmulqdq.rs

Lines changed: 131 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ extern "C" {
3232
///
3333
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
3434
#[inline]
35-
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
35+
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
3636
// Technically, according to Intel's documentation, avx512f is not required here; however, LLVM gets confused without it.
3737
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
3838
#[rustc_args_required_const(2)]
@@ -57,6 +57,26 @@ pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m
5757
#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
5858
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
5959
#[rustc_args_required_const(2)]
60+
pub unsafe fn _mm256_clmulepi64_epi128(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
61+
macro_rules! call {
62+
($imm8:expr) => {
63+
pclmulqdq_256(a, b, $imm8)
64+
};
65+
}
66+
constify_imm8!(imm8, call)
67+
}
68+
69+
#[cfg(test)]
70+
mod tests {
71+
// The constants in the tests below are just bit patterns. They should not
72+
// be interpreted as integers; signedness does not make sense for them, but
73+
// __mXXXi happens to be defined in terms of signed integers.
74+
#![allow(overflowing_literals)]
75+
76+
use stdarch_test::simd_test;
77+
78+
use crate::core_arch::x86::*;
79+
6080
macro_rules! verify_kat_pclmul {
6181
($broadcast:ident, $clmul:ident, $assert:ident) => {
6282
// Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
@@ -72,14 +92,14 @@ pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m
7292
let r10 = $broadcast(r10);
7393
let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
7494
let r11 = $broadcast(r11);
75-
95+
7696
$assert($clmul(a, b, 0x00), r00);
7797
$assert($clmul(a, b, 0x10), r01);
7898
$assert($clmul(a, b, 0x01), r10);
7999
$assert($clmul(a, b, 0x11), r11);
80-
100+
81101
let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
82-
let a0 = $broadcast(a0);
102+
let a0 = $broadcast(a0);
83103
let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
84104
let r = $broadcast(r);
85105
$assert($clmul(a0, a0, 0x00), r);
@@ -88,94 +108,159 @@ pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m
88108

89109
macro_rules! unroll {
90110
($target:ident[4] = $op:ident($source:ident,4);) => {
91-
$target[3] = $op($source,3);
92-
$target[2] = $op($source,2);
93-
unroll!{$target[2] = $op($source,2);}
111+
$target[3] = $op($source, 3);
112+
$target[2] = $op($source, 2);
113+
unroll! {$target[2] = $op($source,2);}
94114
};
95115
($target:ident[2] = $op:ident($source:ident,2);) => {
96-
$target[1] = $op($source,1);
97-
$target[0] = $op($source,0);
116+
$target[1] = $op($source, 1);
117+
$target[0] = $op($source, 0);
98118
};
99119
(assert_eq_m128i($op:ident($vec_res:ident,4),$lin_res:ident[4]);) => {
100-
assert_eq_m128i($op($vec_res,3),$lin_res[3]);
101-
assert_eq_m128i($op($vec_res,2),$lin_res[2]);
102-
unroll!{assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
120+
assert_eq_m128i($op($vec_res, 3), $lin_res[3]);
121+
assert_eq_m128i($op($vec_res, 2), $lin_res[2]);
122+
unroll! {assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
103123
};
104124
(assert_eq_m128i($op:ident($vec_res:ident,2),$lin_res:ident[2]);) => {
105-
assert_eq_m128i($op($vec_res,1),$lin_res[1]);
106-
assert_eq_m128i($op($vec_res,0),$lin_res[0]);
107-
}
125+
assert_eq_m128i($op($vec_res, 1), $lin_res[1]);
126+
assert_eq_m128i($op($vec_res, 0), $lin_res[0]);
127+
};
108128
}
109129

110130
// This helper checks one of the four possible `imm8` selector variants,
111131
// using inputs that differ across the 128-bit lanes.
112132
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
113-
unsafe fn verify_512_helper(linear : unsafe fn(__m128i,__m128i)->__m128i, vectorized : unsafe fn(__m512i,__m512i)->__m512i) {
133+
unsafe fn verify_512_helper(
134+
linear: unsafe fn(__m128i, __m128i) -> __m128i,
135+
vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
136+
) {
114137
let a = _mm512_set_epi64(
115-
0xDCB4DB3657BF0B7D, 0x18DB0601068EDD9F, 0xB76B908233200DC5, 0xE478235FA8E22D5E,
116-
0xAB05CFFA2621154C, 0x1171B47A186174C9, 0x8C6B6C0E7595CEC9, 0xBE3E7D4934E961BD
138+
0xDCB4DB3657BF0B7D,
139+
0x18DB0601068EDD9F,
140+
0xB76B908233200DC5,
141+
0xE478235FA8E22D5E,
142+
0xAB05CFFA2621154C,
143+
0x1171B47A186174C9,
144+
0x8C6B6C0E7595CEC9,
145+
0xBE3E7D4934E961BD,
117146
);
118147
let b = _mm512_set_epi64(
119-
0x672F6F105A94CEA7, 0x8298B8FFCA5F829C, 0xA3927047B3FB61D8, 0x978093862CDE7187,
120-
0xB1927AB22F31D0EC, 0xA9A5DA619BE4D7AF, 0xCA2590F56884FDC6, 0x19BE9F660038BDB5
148+
0x672F6F105A94CEA7,
149+
0x8298B8FFCA5F829C,
150+
0xA3927047B3FB61D8,
151+
0x978093862CDE7187,
152+
0xB1927AB22F31D0EC,
153+
0xA9A5DA619BE4D7AF,
154+
0xCA2590F56884FDC6,
155+
0x19BE9F660038BDB5,
121156
);
122157

123-
let mut a_decomp = [_mm_setzero_si128();4];
158+
let mut a_decomp = [_mm_setzero_si128(); 4];
124159
unroll! {a_decomp[4] = _mm512_extracti32x4_epi32(a,4);}
125-
let mut b_decomp = [_mm_setzero_si128();4];
160+
let mut b_decomp = [_mm_setzero_si128(); 4];
126161
unroll! {b_decomp[4] = _mm512_extracti32x4_epi32(b,4);}
127162

128163
let r = vectorized(a, b);
129-
let mut e_decomp = [_mm_setzero_si128();4];
164+
let mut e_decomp = [_mm_setzero_si128(); 4];
130165
for i in 0..4 {
131-
e_decomp[i] = linear(a_decomp[i],b_decomp[i]);
166+
e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
132167
}
133-
unroll!{assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
168+
unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
134169
}
135170

136171
// This helper checks one of the four possible `imm8` selector variants,
137172
// using inputs that differ across the 128-bit lanes, for the 256-bit (VL) version.
138173
#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
139-
unsafe fn verify_256_helper(linear : unsafe fn(__m128i,__m128i)->__m128i, vectorized : unsafe fn(__m256i,__m256i)->__m256i) {
174+
unsafe fn verify_256_helper(
175+
linear: unsafe fn(__m128i, __m128i) -> __m128i,
176+
vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
177+
) {
140178
let a = _mm512_set_epi64(
141-
0xDCB4DB3657BF0B7D, 0x18DB0601068EDD9F, 0xB76B908233200DC5, 0xE478235FA8E22D5E,
142-
0xAB05CFFA2621154C, 0x1171B47A186174C9, 0x8C6B6C0E7595CEC9, 0xBE3E7D4934E961BD
179+
0xDCB4DB3657BF0B7D,
180+
0x18DB0601068EDD9F,
181+
0xB76B908233200DC5,
182+
0xE478235FA8E22D5E,
183+
0xAB05CFFA2621154C,
184+
0x1171B47A186174C9,
185+
0x8C6B6C0E7595CEC9,
186+
0xBE3E7D4934E961BD,
143187
);
144188
let b = _mm512_set_epi64(
145-
0x672F6F105A94CEA7, 0x8298B8FFCA5F829C, 0xA3927047B3FB61D8, 0x978093862CDE7187,
146-
0xB1927AB22F31D0EC, 0xA9A5DA619BE4D7AF, 0xCA2590F56884FDC6, 0x19BE9F660038BDB5
189+
0x672F6F105A94CEA7,
190+
0x8298B8FFCA5F829C,
191+
0xA3927047B3FB61D8,
192+
0x978093862CDE7187,
193+
0xB1927AB22F31D0EC,
194+
0xA9A5DA619BE4D7AF,
195+
0xCA2590F56884FDC6,
196+
0x19BE9F660038BDB5,
147197
);
148198

149-
let mut a_decomp = [_mm_setzero_si128();2];
199+
let mut a_decomp = [_mm_setzero_si128(); 2];
150200
unroll! {a_decomp[2] = _mm512_extracti32x4_epi32(a,2);}
151-
let mut b_decomp = [_mm_setzero_si128();2];
201+
let mut b_decomp = [_mm_setzero_si128(); 2];
152202
unroll! {b_decomp[2] = _mm512_extracti32x4_epi32(b,2);}
153203

154-
let r = vectorized(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
155-
let mut e_decomp = [_mm_setzero_si128();2];
204+
let r = vectorized(
205+
_mm512_extracti64x4_epi64(a, 0),
206+
_mm512_extracti64x4_epi64(b, 0),
207+
);
208+
let mut e_decomp = [_mm_setzero_si128(); 2];
156209
for i in 0..2 {
157-
e_decomp[i] = linear(a_decomp[i],b_decomp[i]);
210+
e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
158211
}
159-
unroll!{assert_eq_m128i(_mm256_extracti128_si256(r,2),e_decomp[2]);}
212+
unroll! {assert_eq_m128i(_mm256_extracti128_si256(r,2),e_decomp[2]);}
160213
}
161214

162215
#[simd_test(enable = "avx512vpclmulqdq,avx512f")]
163216
unsafe fn test_mm512_clmulepi64_epi128() {
164-
verify_kat_pclmul!(_mm512_broadcast_i32x4,_mm512_clmulepi64_epi128,assert_eq_m512i);
217+
verify_kat_pclmul!(
218+
_mm512_broadcast_i32x4,
219+
_mm512_clmulepi64_epi128,
220+
assert_eq_m512i
221+
);
165222

166-
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x00),|a,b|_mm512_clmulepi64_epi128(a, b, 0x00));
167-
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x01),|a,b|_mm512_clmulepi64_epi128(a, b, 0x01));
168-
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x10),|a,b|_mm512_clmulepi64_epi128(a, b, 0x10));
169-
verify_512_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x11),|a,b|_mm512_clmulepi64_epi128(a, b, 0x11));
223+
verify_512_helper(
224+
|a, b| _mm_clmulepi64_si128(a, b, 0x00),
225+
|a, b| _mm512_clmulepi64_epi128(a, b, 0x00),
226+
);
227+
verify_512_helper(
228+
|a, b| _mm_clmulepi64_si128(a, b, 0x01),
229+
|a, b| _mm512_clmulepi64_epi128(a, b, 0x01),
230+
);
231+
verify_512_helper(
232+
|a, b| _mm_clmulepi64_si128(a, b, 0x10),
233+
|a, b| _mm512_clmulepi64_epi128(a, b, 0x10),
234+
);
235+
verify_512_helper(
236+
|a, b| _mm_clmulepi64_si128(a, b, 0x11),
237+
|a, b| _mm512_clmulepi64_epi128(a, b, 0x11),
238+
);
170239
}
171240

172241
#[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
173242
unsafe fn test_mm256_clmulepi64_epi128() {
174-
verify_kat_pclmul!(_mm256_broadcastsi128_si256,_mm256_clmulepi64_epi128,assert_eq_m256i);
243+
verify_kat_pclmul!(
244+
_mm256_broadcastsi128_si256,
245+
_mm256_clmulepi64_epi128,
246+
assert_eq_m256i
247+
);
175248

176-
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x00),|a,b|_mm256_clmulepi64_epi128(a, b, 0x00));
177-
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x01),|a,b|_mm256_clmulepi64_epi128(a, b, 0x01));
178-
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x10),|a,b|_mm256_clmulepi64_epi128(a, b, 0x10));
179-
verify_256_helper(|a,b|_mm_clmulepi64_si128(a, b, 0x11),|a,b|_mm256_clmulepi64_epi128(a, b, 0x11));
249+
verify_256_helper(
250+
|a, b| _mm_clmulepi64_si128(a, b, 0x00),
251+
|a, b| _mm256_clmulepi64_epi128(a, b, 0x00),
252+
);
253+
verify_256_helper(
254+
|a, b| _mm_clmulepi64_si128(a, b, 0x01),
255+
|a, b| _mm256_clmulepi64_epi128(a, b, 0x01),
256+
);
257+
verify_256_helper(
258+
|a, b| _mm_clmulepi64_si128(a, b, 0x10),
259+
|a, b| _mm256_clmulepi64_epi128(a, b, 0x10),
260+
);
261+
verify_256_helper(
262+
|a, b| _mm_clmulepi64_si128(a, b, 0x11),
263+
|a, b| _mm256_clmulepi64_epi128(a, b, 0x11),
264+
);
180265
}
181266
}

0 commit comments

Comments
 (0)