From b6d58eb2738640849de1b34444a73472073d0998 Mon Sep 17 00:00:00 2001 From: Jan Berktold Date: Mon, 11 Nov 2024 07:28:46 +0000 Subject: [PATCH 1/7] Initial --- crates/core_arch/src/x86/sha.rs | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 144677818a..561ad42309 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -16,6 +16,12 @@ extern "C" { fn sha256msg2(a: i32x4, b: i32x4) -> i32x4; #[link_name = "llvm.x86.sha256rnds2"] fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4; + #[link_name = "llvm.x86.vsha512msg1"] + fn vsha512msg1(a: i32x8, b: i32x4) -> i32x8; + #[link_name = "llvm.x86.vsha512msg2"] + fn vsha512msg2(a: i32x8, b: i32x8) -> i32x8; + #[link_name = "llvm.x86.vsha512rnds2"] + fn vsha512rnds2_epi64(a: i32x8, b: i32x8, c: i32x4) -> i32x4; } #[cfg(test)] @@ -118,6 +124,30 @@ pub unsafe fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m12 transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) } + +/// Performs an intermediate calculation for the next four SHA512 message qwords. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg1))] +#[unstable(feature = "sha512", issue = "none")] +pub unsafe fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { + transmute(vsha512msg1(a.as_i32x8(), b.as_i32x4())) +} + + +/// Performs the final calculation for the next four SHA512 message qwords. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512msg2))] +#[unstable(feature = "sha512", issue = "none")] +pub unsafe fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i { + transmute(vsha512msg2(a.as_i32x8(), b.as_i32x8())) +} + #[cfg(test)] mod tests { use std::{ @@ -215,4 +245,14 @@ mod tests { let r = _mm_sha256rnds2_epu32(a, b, k); assert_eq_m128i(r, expected); } + + #[simd_test(enable = "sha512,avx")] + #[allow(overflowing_literals)] + unsafe fn test_mm256_sha512msg1_epi64() { + let a = _mm256_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98, 0x0, 0x0); + let b = _mm_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b); + let expected = _mm256_set_epi64x(0xeb84973fd5cda67d, 0x2857b88f406b09ee, 0x0, 0x0); + let r = _mm256_sha512msg1_epi64(a, b); + assert_eq_m256i(r, expected); + } } From 6bcf9ed628c2b971b89e54d6446ec3e369911e2d Mon Sep 17 00:00:00 2001 From: Jan Berktold Date: Tue, 12 Nov 2024 04:57:14 +0000 Subject: [PATCH 2/7] Define all three instrinsics --- crates/core_arch/src/x86/sha.rs | 47 +++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 561ad42309..16ee3326bd 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -17,11 +17,11 @@ extern "C" { #[link_name = "llvm.x86.sha256rnds2"] fn sha256rnds2(a: i32x4, b: i32x4, k: i32x4) -> i32x4; #[link_name = "llvm.x86.vsha512msg1"] - fn vsha512msg1(a: i32x8, b: i32x4) -> i32x8; + fn vsha512msg1(a: i64x4, b: i64x2) -> i64x4; #[link_name = "llvm.x86.vsha512msg2"] - fn vsha512msg2(a: i32x8, b: i32x8) -> i32x8; + fn vsha512msg2(a: i64x4, b: i64x4) -> i64x4; #[link_name = "llvm.x86.vsha512rnds2"] - fn vsha512rnds2_epi64(a: i32x8, b: i32x8, c: i32x4) -> i32x4; + fn vsha512rnds2(a: i64x4, b: i64x4, c: i64x2) -> i64x4; } #[cfg(test)] @@ -133,7 +133,7 @@ pub unsafe fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m12 #[cfg_attr(test, assert_instr(vsha512msg1))] #[unstable(feature = "sha512", issue = "none")] pub unsafe fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { - transmute(vsha512msg1(a.as_i32x8(), b.as_i32x4())) + transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) } @@ -145,7 +145,22 @@ pub unsafe fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { #[cfg_attr(test, assert_instr(vsha512msg2))] #[unstable(feature = "sha512", issue = "none")] pub unsafe fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(vsha512msg2(a.as_i32x8(), b.as_i32x8())) + transmute(vsha512msg2(a.as_i64x4(), b.as_i64x4())) +} + +/// Performs two rounds of SHA512 operation using initial SHA512 state (C,D,G,H) from `a`, +/// an initial SHA512 state (A,B,E,F) from `b`, and a pre-computed sum of the next two +/// round message qwords and the corresponding round constants from `c` (only the two +/// lower qwords of the third operand). The updated SHA512 state (A,B,E,F) is returned, and +/// can be used as the updated state (C,D,G,H) in later rounds. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512rnds2_epi64) +#[inline] +#[target_feature(enable = "sha512,avx")] +#[cfg_attr(test, assert_instr(vsha512rnds2))] +#[unstable(feature = "sha512", issue = "none")] +pub unsafe fn _mm256_sha512rnds2_epi64(a: __m256i, b: __m256i, c: __m128i) -> __m256i { + transmute(vsha512rnds2(a.as_i64x4(), b.as_i64x4(), c.as_i64x2())) } #[cfg(test)] @@ -255,4 +270,26 @@ mod tests { let r = _mm256_sha512msg1_epi64(a, b); assert_eq_m256i(r, expected); } + + #[simd_test(enable = "sha512,avx")] + #[allow(overflowing_literals)] + unsafe fn test_mm256_sha512msg2_epi64() { + let a = _mm256_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98, 0x0, 0x0); + let b = _mm256_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98, 0x0, 0x0); + let expected = _mm256_set_epi64x(0xf714b202d863d47d, 0x90c30d946b3d3b35, 0x0, 0x0); + let r = _mm256_sha512msg2_epi64(a, b); + assert_eq_m256i(r, expected); + } + + + #[simd_test(enable = "sha512,avx")] + #[allow(overflowing_literals)] + unsafe fn test_mm_sha512rnds2_epi64() { + let a = _mm256_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98, 0x0, 0x0); + let b = _mm256_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b, 0x0, 0x0); + let k = _mm_set_epi64x(0, 0x12835b01d807aa98); + let expected = _mm256_set_epi64x(0xd3063037effb15ea, 0x187ee3db0d6d1d19, 0x0, 0x0); + let r = _mm256_sha512rnds2_epi64(a, b, k); + assert_eq_m256i(r, expected); + } } From 4e4d26118dd1f9d7bbe6b261a3d9710ccba487b3 Mon Sep 17 00:00:00 2001 From: Jan Berktold Date: Tue, 12 Nov 2024 05:00:04 +0000 Subject: [PATCH 3/7] style --- crates/core_arch/src/x86/sha.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 16ee3326bd..94d145c04f 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -124,7 +124,6 @@ pub unsafe fn _mm_sha256rnds2_epu32(a: __m128i, b: __m128i, k: __m128i) -> __m12 transmute(sha256rnds2(a.as_i32x4(), b.as_i32x4(), k.as_i32x4())) } - /// Performs an intermediate calculation for the next four SHA512 message qwords. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg1_epi64) @@ -136,7 +135,6 @@ pub unsafe fn _mm256_sha512msg1_epi64(a: __m256i, b: __m128i) -> __m256i { transmute(vsha512msg1(a.as_i64x4(), b.as_i64x2())) } - /// Performs the final calculation for the next four SHA512 message qwords. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sha512msg2_epi64) From 5ad14ea385516a58d3da973f12f63e592e5380c1 Mon Sep 17 00:00:00 2001 From: Jan Berktold Date: Tue, 12 Nov 2024 05:00:33 +0000 Subject: [PATCH 4/7] more style --- crates/core_arch/src/x86/sha.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 94d145c04f..5491b6a83e 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -279,7 +279,6 @@ mod tests { assert_eq_m256i(r, expected); } - #[simd_test(enable = "sha512,avx")] #[allow(overflowing_literals)] unsafe fn test_mm_sha512rnds2_epi64() { From e29c8467f1ff16011d3c4fef0245da2deb844d17 Mon Sep 17 00:00:00 2001 From: Jan Berktold Date: Tue, 12 Nov 2024 05:02:23 +0000 Subject: [PATCH 5/7] last style --- crates/core_arch/src/x86/sha.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 5491b6a83e..5aa8bdd85f 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -147,7 +147,7 @@ pub unsafe fn _mm256_sha512msg2_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Performs two rounds of SHA512 operation using initial SHA512 state (C,D,G,H) from `a`, -/// an initial SHA512 state (A,B,E,F) from `b`, and a pre-computed sum of the next two +/// an initial SHA512 state (A,B,E,F) from `b`, and a pre-computed sum of the next two /// round message qwords and the corresponding round constants from `c` (only the two /// lower qwords of the third operand). The updated SHA512 state (A,B,E,F) is returned, and /// can be used as the updated state (C,D,G,H) in later rounds. From f4cf322855898a2d48e39b7b8bde1507d4b232ec Mon Sep 17 00:00:00 2001 From: Jan Berktold Date: Tue, 12 Nov 2024 05:05:08 +0000 Subject: [PATCH 6/7] Fix test naming --- crates/core_arch/src/x86/sha.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/src/x86/sha.rs b/crates/core_arch/src/x86/sha.rs index 5aa8bdd85f..dc82b278da 100644 --- a/crates/core_arch/src/x86/sha.rs +++ b/crates/core_arch/src/x86/sha.rs @@ -281,7 +281,7 @@ mod tests { #[simd_test(enable = "sha512,avx")] #[allow(overflowing_literals)] - unsafe fn test_mm_sha512rnds2_epi64() { + unsafe fn test_mm256_sha512rnds2_epi64() { let a = _mm256_set_epi64x(0xe9b5dba5b5c0fbcf, 0x71374491428a2f98, 0x0, 0x0); let b = _mm256_set_epi64x(0xab1c5ed5923f82a4, 0x59f111f13956c25b, 0x0, 0x0); let k = _mm_set_epi64x(0, 0x12835b01d807aa98); From ffec0b23cbd2370b8de8be3898b4613ada05980e Mon Sep 17 00:00:00 2001 From: Jan Berktold Date: Tue, 12 Nov 2024 11:53:02 -0800 Subject: [PATCH 7/7] Update main.yml --- .github/workflows/main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index eb4a7b6dbf..928dfc7688 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -221,6 +221,10 @@ jobs: shell: bash if: startsWith(matrix.target.tuple, 'thumb') || matrix.target.tuple == 'nvptx64-nvidia-cuda' + - run: objdump --version + if: matrix.target.tuple == 'x86_64-apple-darwin' + shell: bash + # Windows & OSX go straight to `run.sh` ... - run: ./ci/run.sh shell: bash