From 5c61a48d75a477f4c7b9bbd0af649af6428af5df Mon Sep 17 00:00:00 2001
From: Ben Noordhuis
Date: Tue, 1 Mar 2016 14:03:58 +0100
Subject: [PATCH] deps: upgrade openssl to 1.0.2g

PR-URL: https://github.com/nodejs/node/pull/5507
Reviewed-By: Fedor Indutny
---
 .../asm/x64-elf-gas/aes/aesni-mb-x86_64.s | 929 ----
 .../asm/x64-elf-gas/aes/aesni-sha1-x86_64.s | 1335 +----
 .../asm/x64-elf-gas/aes/aesni-sha256-x86_64.s | 4297 ---------------
 deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s | 1637 +-----
 deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s | 856 +--
 deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s | 2 +-
 deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s | 423 +-
 .../openssl/asm/x64-elf-gas/bn/x86_64-mont5.s | 2117 +++----
 .../asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s | 1529 +-----
 .../asm/x64-elf-gas/modes/aesni-gcm-x86_64.s | 752 +--
 .../asm/x64-elf-gas/modes/ghash-x86_64.s | 557 +-
 .../asm/x64-elf-gas/sha/sha1-mb-x86_64.s | 4303 +--------------
 .../openssl/asm/x64-elf-gas/sha/sha1-x86_64.s | 2813 +---------
 .../asm/x64-elf-gas/sha/sha256-mb-x86_64.s | 4728 +----------------
 .../asm/x64-elf-gas/sha/sha256-x86_64.s | 2353 +-------
 .../asm/x64-elf-gas/sha/sha512-x86_64.s | 3582 -------------
 .../asm/x64-macosx-gas/aes/aesni-mb-x86_64.s | 929 ----
 .../x64-macosx-gas/aes/aesni-sha1-x86_64.s | 1335 +----
 .../x64-macosx-gas/aes/aesni-sha256-x86_64.s | 4296 ---------------
 .../openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s | 1632 +-----
 .../asm/x64-macosx-gas/bn/rsaz-x86_64.s | 857 +--
 .../asm/x64-macosx-gas/bn/x86_64-gf2m.s | 2 +-
 .../asm/x64-macosx-gas/bn/x86_64-mont.s | 421 +-
 .../asm/x64-macosx-gas/bn/x86_64-mont5.s | 2103 +++----
 .../x64-macosx-gas/ec/ecp_nistz256-x86_64.s | 1529 +-----
 .../x64-macosx-gas/modes/aesni-gcm-x86_64.s | 749 +--
 .../asm/x64-macosx-gas/modes/ghash-x86_64.s | 557 +-
 .../asm/x64-macosx-gas/sha/sha1-mb-x86_64.s | 4303 +--------------
 .../asm/x64-macosx-gas/sha/sha1-x86_64.s | 2813 +---------
 .../asm/x64-macosx-gas/sha/sha256-mb-x86_64.s | 4728 +----------------
 .../asm/x64-macosx-gas/sha/sha256-x86_64.s | 2353 +-------
 .../asm/x64-macosx-gas/sha/sha512-x86_64.s | 3581 -------------
 .../asm/x64-win32-masm/bn/rsaz-avx2.asm | 171 +-
 .../asm/x64-win32-masm/bn/rsaz-x86_64.asm | 353 +-
 .../asm/x64-win32-masm/bn/x86_64-mont.asm | 219 +-
 .../asm/x64-win32-masm/bn/x86_64-mont5.asm | 1424 +++--
 .../x64-win32-masm/ec/ecp_nistz256-x86_64.asm | 22 +-
 .../x64-win32-masm/modes/aesni-gcm-x86_64.asm | 2 +-
 deps/openssl/asm/x86-elf-gas/sha/sha1-586.s | 1175 ----
 deps/openssl/asm/x86-elf-gas/sha/sha256-586.s | 2248 +------
 .../openssl/asm/x86-macosx-gas/sha/sha1-586.s | 1173 ----
 .../asm/x86-macosx-gas/sha/sha256-586.s | 2248 +------
 .../asm/x86-win32-masm/sha/sha1-586.asm | 1174 ----
 .../asm/x86-win32-masm/sha/sha256-586.asm | 2248 +------
 .../asm_obsolete/x64-elf-gas/aes/aes-x86_64.s | 70 +-
 .../x64-elf-gas/aes/aesni-sha1-x86_64.s | 8 +-
 .../x64-elf-gas/aes/aesni-x86_64.s | 112 +-
 .../x64-elf-gas/aes/bsaes-x86_64.s | 158 +-
 .../x64-elf-gas/aes/vpaes-x86_64.s | 20 +-
 .../asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s | 218 +-
 .../asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s | 2 +-
 .../asm_obsolete/x64-elf-gas/bn/x86_64-mont.s | 108 +-
 .../x64-elf-gas/bn/x86_64-mont5.s | 901 +++-
 .../x64-elf-gas/camellia/cmll-x86_64.s | 2 +-
 .../x64-elf-gas/ec/ecp_nistz256-x86_64.s | 27 +-
 .../x64-elf-gas/modes/ghash-x86_64.s | 82 +-
 .../x64-elf-gas/sha/sha1-mb-x86_64.s | 16 +-
 .../x64-elf-gas/sha/sha1-x86_64.s | 8 +-
 .../x64-elf-gas/sha/sha256-mb-x86_64.s | 84 +-
 .../x64-elf-gas/sha/sha256-x86_64.s | 44 +-
.../asm_obsolete/x64-elf-gas/x86_64cpuid.s | 48 +- .../x64-macosx-gas/aes/aes-x86_64.s | 70 +- .../x64-macosx-gas/aes/aesni-sha1-x86_64.s | 8 +- .../x64-macosx-gas/aes/aesni-x86_64.s | 112 +- .../x64-macosx-gas/aes/bsaes-x86_64.s | 158 +- .../x64-macosx-gas/aes/vpaes-x86_64.s | 20 +- .../x64-macosx-gas/bn/rsaz-x86_64.s | 219 +- .../x64-macosx-gas/bn/x86_64-gf2m.s | 2 +- .../x64-macosx-gas/bn/x86_64-mont.s | 108 +- .../x64-macosx-gas/bn/x86_64-mont5.s | 897 +++- .../x64-macosx-gas/camellia/cmll-x86_64.s | 2 +- .../x64-macosx-gas/ec/ecp_nistz256-x86_64.s | 27 +- .../x64-macosx-gas/modes/ghash-x86_64.s | 82 +- .../x64-macosx-gas/sha/sha1-mb-x86_64.s | 16 +- .../x64-macosx-gas/sha/sha1-x86_64.s | 8 +- .../x64-macosx-gas/sha/sha256-mb-x86_64.s | 84 +- .../x64-macosx-gas/sha/sha256-x86_64.s | 44 +- .../asm_obsolete/x64-macosx-gas/x86_64cpuid.s | 48 +- .../x64-win32-masm/bn/rsaz-x86_64.asm | 291 +- .../x64-win32-masm/bn/x86_64-mont.asm | 108 +- .../x64-win32-masm/bn/x86_64-mont5.asm | 935 +++- .../x64-win32-masm/ec/ecp_nistz256-x86_64.asm | 11 +- deps/openssl/openssl/CHANGES | 134 +- deps/openssl/openssl/Configure | 8 +- deps/openssl/openssl/Makefile.shared | 6 +- deps/openssl/openssl/NEWS | 13 + deps/openssl/openssl/README | 2 +- deps/openssl/openssl/apps/apps.c | 8 +- deps/openssl/openssl/apps/apps.h | 2 +- deps/openssl/openssl/apps/pkeyutl.c | 90 +- deps/openssl/openssl/apps/req.c | 4 +- deps/openssl/openssl/apps/rsautl.c | 6 +- deps/openssl/openssl/apps/s_client.c | 2 - deps/openssl/openssl/apps/s_server.c | 49 +- deps/openssl/openssl/config | 3 +- deps/openssl/openssl/crypto/asn1/tasn_dec.c | 14 +- deps/openssl/openssl/crypto/bio/b_print.c | 187 +- deps/openssl/openssl/crypto/bio/bio.h | 4 +- deps/openssl/openssl/crypto/bio/bss_mem.c | 6 +- deps/openssl/openssl/crypto/bn/Makefile | 4 +- .../openssl/crypto/bn/asm/rsaz-avx2.pl | 219 +- .../openssl/crypto/bn/asm/rsaz-x86_64.pl | 375 +- .../openssl/crypto/bn/asm/x86_64-mont.pl | 227 +- .../openssl/crypto/bn/asm/x86_64-mont5.pl | 1276 +++-- deps/openssl/openssl/crypto/bn/bn.h | 14 +- deps/openssl/openssl/crypto/bn/bn_exp.c | 103 +- deps/openssl/openssl/crypto/bn/bn_print.c | 17 +- deps/openssl/openssl/crypto/bn/bn_recp.c | 1 + deps/openssl/openssl/crypto/cmac/cmac.c | 8 + deps/openssl/openssl/crypto/cryptlib.c | 6 +- deps/openssl/openssl/crypto/crypto.h | 2 +- deps/openssl/openssl/crypto/dh/dh.h | 2 +- deps/openssl/openssl/crypto/dh/dh_check.c | 7 +- deps/openssl/openssl/crypto/dsa/dsa_ameth.c | 22 +- deps/openssl/openssl/crypto/dso/dso_lib.c | 1 + .../crypto/ec/asm/ecp_nistz256-x86_64.pl | 11 +- deps/openssl/openssl/crypto/ec/ecp_nistp224.c | 4 +- deps/openssl/openssl/crypto/ec/ecp_nistp256.c | 4 +- deps/openssl/openssl/crypto/ec/ecp_nistp521.c | 4 +- deps/openssl/openssl/crypto/ec/ectest.c | 9 + deps/openssl/openssl/crypto/engine/eng_dyn.c | 4 +- deps/openssl/openssl/crypto/evp/e_des.c | 11 +- deps/openssl/openssl/crypto/evp/e_des3.c | 13 +- .../crypto/modes/asm/aesni-gcm-x86_64.pl | 4 +- .../openssl/crypto/modes/asm/ghash-x86_64.pl | 2 +- deps/openssl/openssl/crypto/modes/ctr128.c | 41 +- deps/openssl/openssl/crypto/opensslv.h | 6 +- .../openssl/crypto/perlasm/x86_64-xlate.pl | 7 +- deps/openssl/openssl/crypto/pkcs7/pk7_smime.c | 17 + deps/openssl/openssl/crypto/rsa/rsa_sign.c | 4 +- deps/openssl/openssl/crypto/srp/srp.h | 10 + deps/openssl/openssl/crypto/srp/srp_vfy.c | 57 +- deps/openssl/openssl/crypto/stack/stack.c | 2 +- deps/openssl/openssl/crypto/x509/x509_vfy.c | 70 +- deps/openssl/openssl/doc/apps/ciphers.pod | 59 +- 
deps/openssl/openssl/doc/apps/pkeyutl.pod | 13 + deps/openssl/openssl/doc/apps/req.pod | 9 +- deps/openssl/openssl/doc/apps/s_client.pod | 12 +- deps/openssl/openssl/doc/apps/s_server.pod | 8 +- deps/openssl/openssl/doc/crypto/BIO_s_mem.pod | 4 +- deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod | 33 +- deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod | 168 +- .../openssl/doc/ssl/SSL_CTX_set_options.pod | 10 + deps/openssl/openssl/doc/ssl/ssl.pod | 77 +- deps/openssl/openssl/engines/e_capi.c | 32 + deps/openssl/openssl/include/openssl/bio.h | 4 +- deps/openssl/openssl/include/openssl/bn.h | 14 +- deps/openssl/openssl/include/openssl/crypto.h | 2 +- deps/openssl/openssl/include/openssl/dh.h | 2 +- .../openssl/include/openssl/opensslv.h | 6 +- deps/openssl/openssl/include/openssl/srp.h | 10 + deps/openssl/openssl/include/openssl/ssl.h | 1 - deps/openssl/openssl/ms/uplink-x86.pl | 4 +- deps/openssl/openssl/openssl.spec | 2 +- deps/openssl/openssl/ssl/Makefile | 69 +- deps/openssl/openssl/ssl/s2_lib.c | 6 + deps/openssl/openssl/ssl/s3_lib.c | 69 +- deps/openssl/openssl/ssl/ssl.h | 1 - deps/openssl/openssl/ssl/ssl_conf.c | 10 +- deps/openssl/openssl/ssl/ssl_err.c | 1 - deps/openssl/openssl/ssl/ssl_lib.c | 14 +- deps/openssl/openssl/ssl/sslv2conftest.c | 231 + deps/openssl/openssl/test/Makefile | 35 +- deps/openssl/openssl/util/libeay.num | 2 + deps/openssl/openssl/util/mk1mf.pl | 4 +- deps/openssl/openssl/util/pl/BC-32.pl | 4 +- deps/openssl/openssl/util/pl/Mingw32.pl | 2 +- deps/openssl/openssl/util/pl/OS2-EMX.pl | 4 +- deps/openssl/openssl/util/pl/VC-32.pl | 10 +- deps/openssl/openssl/util/pl/linux.pl | 2 +- deps/openssl/openssl/util/pl/netware.pl | 8 +- deps/openssl/openssl/util/pl/ultrix.pl | 2 +- deps/openssl/openssl/util/pl/unix.pl | 2 +- 173 files changed, 9866 insertions(+), 76198 deletions(-) create mode 100644 deps/openssl/openssl/ssl/sslv2conftest.c diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s index 543b58831656cb..b0467f2f9f2995 100644 --- a/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-mb-x86_64.s @@ -6,14 +6,6 @@ .type aesni_multi_cbc_encrypt,@function .align 32 aesni_multi_cbc_encrypt: - cmpl $2,%edx - jb .Lenc_non_avx - movl OPENSSL_ia32cap_P+4(%rip),%ecx - testl $268435456,%ecx - jnz _avx_cbc_enc_shortcut - jmp .Lenc_non_avx -.align 16 -.Lenc_non_avx: movq %rsp,%rax pushq %rbx pushq %rbp @@ -270,14 +262,6 @@ aesni_multi_cbc_encrypt: .type aesni_multi_cbc_decrypt,@function .align 32 aesni_multi_cbc_decrypt: - cmpl $2,%edx - jb .Ldec_non_avx - movl OPENSSL_ia32cap_P+4(%rip),%ecx - testl $268435456,%ecx - jnz _avx_cbc_dec_shortcut - jmp .Ldec_non_avx -.align 16 -.Ldec_non_avx: movq %rsp,%rax pushq %rbx pushq %rbp @@ -520,916 +504,3 @@ aesni_multi_cbc_decrypt: .Ldec4x_epilogue: .byte 0xf3,0xc3 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt -.type aesni_multi_cbc_encrypt_avx,@function -.align 32 -aesni_multi_cbc_encrypt_avx: -_avx_cbc_enc_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - - - - - - - - subq $192,%rsp - andq $-128,%rsp - movq %rax,16(%rsp) - -.Lenc8x_body: - vzeroupper - vmovdqu (%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -.Lenc8x_loop_grande: - - xorl %edx,%edx - movl -144(%rdi),%ecx - movq -160(%rdi),%r8 - cmpl %edx,%ecx - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq 
%r8,%rbx - movq %rbx,64(%rsp) - movl -104(%rdi),%ecx - movq -120(%rdi),%r9 - cmpl %edx,%ecx - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - movl -64(%rdi),%ecx - movq -80(%rdi),%r10 - cmpl %edx,%ecx - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - movl -24(%rdi),%ecx - movq -40(%rdi),%r11 - cmpl %edx,%ecx - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq %rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - movl 16(%rdi),%ecx - movq 0(%rdi),%r12 - cmpl %edx,%ecx - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - movl 56(%rdi),%ecx - movq 40(%rdi),%r13 - cmpl %edx,%ecx - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - movl 96(%rdi),%ecx - movq 80(%rdi),%r14 - cmpl %edx,%ecx - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - movl 136(%rdi),%ecx - movq 120(%rdi),%r15 - cmpl %edx,%ecx - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - testl %edx,%edx - jz .Lenc8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - - vpxor (%r8),%xmm15,%xmm10 - leaq 128(%rsp),%rbp - vpxor (%r9),%xmm15,%xmm11 - vpxor (%r10),%xmm15,%xmm12 - vpxor (%r11),%xmm15,%xmm13 - vpxor %xmm10,%xmm2,%xmm2 - vpxor (%r12),%xmm15,%xmm10 - vpxor %xmm11,%xmm3,%xmm3 - vpxor (%r13),%xmm15,%xmm11 - vpxor %xmm12,%xmm4,%xmm4 - vpxor (%r14),%xmm15,%xmm12 - vpxor %xmm13,%xmm5,%xmm5 - vpxor (%r15),%xmm15,%xmm13 - vpxor %xmm10,%xmm6,%xmm6 - movl $1,%ecx - vpxor %xmm11,%xmm7,%xmm7 - vpxor %xmm12,%xmm8,%xmm8 - vpxor %xmm13,%xmm9,%xmm9 - jmp .Loop_enc8x - -.align 32 -.Loop_enc8x: - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r8),%xmm15,%xmm10 - movq %rbx,64+0(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,0(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r9),%xmm15,%xmm11 - movq %rbx,64+8(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,16(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq 
%r10,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r10),%xmm15,%xmm12 - movq %rbx,64+16(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,32(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r9) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r11),%xmm15,%xmm13 - movq %rbx,64+24(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,48(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r12),%xmm15,%xmm10 - movq %rbx,64+32(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r13),%xmm15,%xmm11 - movq %rbx,64+40(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r14),%xmm15,%xmm12 - movq %rbx,64+48(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 24(%rsi),%xmm1 - leaq 16(%r14,%rbx,1),%r14 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r15),%xmm15,%xmm13 - movq %rbx,64+56(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - cmpl $11,%eax - jb .Lenc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je .Lenc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc 
%xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -.Lenc8x_tail: - vaesenc %xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesenc %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesenclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesenclast %xmm0,%xmm3,%xmm3 - vaesenclast %xmm0,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenclast %xmm0,%xmm5,%xmm5 - vaesenclast %xmm0,%xmm6,%xmm6 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesenclast %xmm0,%xmm7,%xmm7 - vaesenclast %xmm0,%xmm8,%xmm8 - vmovdqa %xmm14,48(%rsp) - vaesenclast %xmm0,%xmm9,%xmm9 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vpxor 0(%rbp),%xmm2,%xmm2 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vpxor 16(%rbp),%xmm3,%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vpxor 32(%rbp),%xmm4,%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vpxor 48(%rbp),%xmm5,%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vpxor %xmm10,%xmm6,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vpxor %xmm11,%xmm7,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - vpxor %xmm12,%xmm8,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vpxor %xmm13,%xmm9,%xmm9 - - decl %edx - jnz .Loop_enc8x - - movq 16(%rsp),%rax - - - - - -.Lenc8x_done: - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lenc8x_epilogue: - .byte 0xf3,0xc3 -.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx - -.type aesni_multi_cbc_decrypt_avx,@function -.align 32 -aesni_multi_cbc_decrypt_avx: -_avx_cbc_dec_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - - - - - - - - - subq $256,%rsp - andq $-256,%rsp - subq $192,%rsp - movq %rax,16(%rsp) - -.Ldec8x_body: - vzeroupper - vmovdqu (%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -.Ldec8x_loop_grande: - - xorl %edx,%edx - movl -144(%rdi),%ecx - movq -160(%rdi),%r8 - cmpl %edx,%ecx - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq %r8,%rbx - movq %rbx,64(%rsp) - vmovdqu %xmm2,192(%rsp) - movl -104(%rdi),%ecx - movq -120(%rdi),%r9 - cmpl %edx,%ecx - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - vmovdqu %xmm3,208(%rsp) - movl -64(%rdi),%ecx - movq -80(%rdi),%r10 - cmpl %edx,%ecx - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - vmovdqu %xmm4,224(%rsp) - movl -24(%rdi),%ecx - movq -40(%rdi),%r11 - cmpl %edx,%ecx - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq 
%rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - vmovdqu %xmm5,240(%rsp) - movl 16(%rdi),%ecx - movq 0(%rdi),%r12 - cmpl %edx,%ecx - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - vmovdqu %xmm6,256(%rsp) - movl 56(%rdi),%ecx - movq 40(%rdi),%r13 - cmpl %edx,%ecx - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - vmovdqu %xmm7,272(%rsp) - movl 96(%rdi),%ecx - movq 80(%rdi),%r14 - cmpl %edx,%ecx - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - vmovdqu %xmm8,288(%rsp) - movl 136(%rdi),%ecx - movq 120(%rdi),%r15 - cmpl %edx,%ecx - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - vmovdqu %xmm9,304(%rsp) - testl %edx,%edx - jz .Ldec8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - leaq 192+128(%rsp),%rbp - - vmovdqu (%r8),%xmm2 - vmovdqu (%r9),%xmm3 - vmovdqu (%r10),%xmm4 - vmovdqu (%r11),%xmm5 - vmovdqu (%r12),%xmm6 - vmovdqu (%r13),%xmm7 - vmovdqu (%r14),%xmm8 - vmovdqu (%r15),%xmm9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm6,64(%rbp) - vpxor %xmm15,%xmm6,%xmm6 - vmovdqu %xmm7,80(%rbp) - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm8,96(%rbp) - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu %xmm9,112(%rbp) - vpxor %xmm15,%xmm9,%xmm9 - xorq $128,%rbp - movl $1,%ecx - jmp .Loop_dec8x - -.align 32 -.Loop_dec8x: - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r8),%xmm10 - movq %rbx,64+0(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,128(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r9),%xmm11 - movq %rbx,64+8(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,144(%rsp) - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r10,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r10),%xmm12 - movq %rbx,64+16(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,160(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesdec %xmm0,%xmm4,%xmm4 - 
prefetcht0 15(%r9) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r11),%xmm13 - movq %rbx,64+24(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,176(%rsp) - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r12),%xmm10 - movq %rbx,64+32(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r13),%xmm11 - movq %rbx,64+40(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r14),%xmm12 - movq %rbx,64+48(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 24(%rsi),%xmm1 - leaq 16(%r14,%rbx,1),%r14 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r15),%xmm13 - movq %rbx,64+56(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - cmpl $11,%eax - jb .Ldec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je .Ldec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -.Ldec8x_tail: - vaesdec 
%xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesdec %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesdeclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesdeclast %xmm0,%xmm3,%xmm3 - vpxor 0(%rbp),%xmm2,%xmm2 - vaesdeclast %xmm0,%xmm4,%xmm4 - vpxor 16(%rbp),%xmm3,%xmm3 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdeclast %xmm0,%xmm5,%xmm5 - vpxor 32(%rbp),%xmm4,%xmm4 - vaesdeclast %xmm0,%xmm6,%xmm6 - vpxor 48(%rbp),%xmm5,%xmm5 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesdeclast %xmm0,%xmm7,%xmm7 - vpxor 64(%rbp),%xmm6,%xmm6 - vaesdeclast %xmm0,%xmm8,%xmm8 - vpxor 80(%rbp),%xmm7,%xmm7 - vmovdqa %xmm14,48(%rsp) - vaesdeclast %xmm0,%xmm9,%xmm9 - vpxor 96(%rbp),%xmm8,%xmm8 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vmovdqu 128+0(%rsp),%xmm2 - vpxor 112(%rbp),%xmm9,%xmm9 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu 128+16(%rsp),%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu 128+32(%rsp),%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu 128+48(%rsp),%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm10,64(%rbp) - vpxor %xmm10,%xmm15,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vmovdqu %xmm11,80(%rbp) - vpxor %xmm11,%xmm15,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - vmovdqu %xmm12,96(%rbp) - vpxor %xmm12,%xmm15,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vmovdqu %xmm13,112(%rbp) - vpxor %xmm13,%xmm15,%xmm9 - - xorq $128,%rbp - decl %edx - jnz .Loop_dec8x - - movq 16(%rsp),%rax - - - - - -.Ldec8x_done: - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Ldec8x_epilogue: - .byte 0xf3,0xc3 -.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s index 88042248685372..edbd5cb343c327 100644 --- a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha1-x86_64.s @@ -10,11 +10,6 @@ aesni_cbc_sha1_enc: movq OPENSSL_ia32cap_P+4(%rip),%r11 btq $61,%r11 jc aesni_cbc_sha1_enc_shaext - andl $268435456,%r11d - andl $1073741824,%r10d - orl %r11d,%r10d - cmpl $1342177280,%r10d - je aesni_cbc_sha1_enc_avx jmp aesni_cbc_sha1_enc_ssse3 .byte 0xf3,0xc3 .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc @@ -1372,1304 +1367,6 @@ aesni_cbc_sha1_enc_ssse3: .Lepilogue_ssse3: .byte 0xf3,0xc3 .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 -.type aesni_cbc_sha1_enc_avx,@function -.align 32 -aesni_cbc_sha1_enc_avx: - movq 8(%rsp),%r10 - - - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -104(%rsp),%rsp - - - vzeroall - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - leaq 112(%rcx),%r15 - vmovdqu (%r8),%xmm12 - movq %r8,88(%rsp) - shlq $6,%r14 - subq %r12,%r13 - movl 240-112(%r15),%r8d - addq %r10,%r14 - - leaq K_XX_XX(%rip),%r11 - movl 0(%r9),%eax - 
movl 4(%r9),%ebx - movl 8(%r9),%ecx - movl 12(%r9),%edx - movl %ebx,%esi - movl 16(%r9),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - vmovdqa 64(%r11),%xmm6 - vmovdqa 0(%r11),%xmm10 - vmovdqu 0(%r10),%xmm0 - vmovdqu 16(%r10),%xmm1 - vmovdqu 32(%r10),%xmm2 - vmovdqu 48(%r10),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r10 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm10,%xmm0,%xmm4 - vpaddd %xmm10,%xmm1,%xmm5 - vpaddd %xmm10,%xmm2,%xmm6 - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - jmp .Loop_avx -.align 32 -.Loop_avx: - shrdl $2,%ebx,%ebx - vmovdqu 0(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%edi - addl 0(%rsp),%ebp - vpaddd %xmm3,%xmm10,%xmm9 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%ebp - vpxor %xmm2,%xmm8,%xmm8 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm9 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpor %xmm8,%xmm4,%xmm4 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - vpxor %xmm9,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm10,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm9 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpor %xmm8,%xmm5,%xmm5 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm9,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa 16(%r11),%xmm10 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpaddd %xmm5,%xmm10,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq 
$4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm9 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpor %xmm8,%xmm6,%xmm6 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm9,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd %xmm6,%xmm10,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm9 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpor %xmm8,%xmm7,%xmm7 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - cmpl $11,%r8d - jb .Lvaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast6: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm9,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm10,%xmm9 - addl %esi,%edx - vmovdqu 16(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,0(%r12,%r13,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl 
$5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm10,%xmm9 - vmovdqa 32(%r11),%xmm10 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - vpaddd %xmm3,%xmm10,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - 
addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm10,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast7: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - vmovdqu 32(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,16(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm10,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm10,%xmm9 - vmovdqa 48(%r11),%xmm10 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld 
$2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm10,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm10,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - cmpl $11,%r8d - jb .Lvaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast8: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm10,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - 
vmovdqu 48(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,32(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm10,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r14,%r10 - je .Ldone_avx - vmovdqa 64(%r11),%xmm9 - vmovdqa 0(%r11),%xmm10 - vmovdqu 0(%r10),%xmm0 - vmovdqu 16(%r10),%xmm1 - vmovdqu 32(%r10),%xmm2 - vmovdqu 48(%r10),%xmm3 - vpshufb %xmm9,%xmm0,%xmm0 - addq $64,%r10 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm9,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm10,%xmm0,%xmm8 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm8,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - 
shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm9,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm10,%xmm1,%xmm8 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm8,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm9,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm10,%xmm2,%xmm8 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm8,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast9: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - leaq 64(%r12),%r12 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx - addl 12(%r9),%edx - movl %eax,0(%r9) - addl 16(%r9),%ebp - movl %esi,4(%r9) - movl %esi,%ebx - movl %ecx,8(%r9) - movl %ecx,%edi - movl %edx,12(%r9) - xorl %edx,%edi - movl %ebp,16(%r9) - andl %edi,%esi - jmp .Loop_avx - -.Ldone_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl 
$5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb .Lvaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je .Lvaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -.Lvaesenclast10: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - movq 88(%rsp),%r8 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx - movl %eax,0(%r9) - addl 12(%r9),%edx - movl %esi,4(%r9) - addl 16(%r9),%ebp - movl %ecx,8(%r9) - movl %edx,12(%r9) - movl %ebp,16(%r9) - vmovups %xmm12,(%r8) - vzeroall - leaq 104(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2695,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext: movups 16(%rcx),%xmm0 leaq 112(%rcx),%rcx - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 jmp .Loop_shaext .align 16 @@ -2759,17 +1456,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm3,%xmm5 .byte 15,56,201,243 cmpl $11,%r11d - jb .Laesenclast11 + jb .Laesenclast6 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast11 + je .Laesenclast6 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast11: +.Laesenclast6: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -2825,17 +1522,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm4,%xmm6 .byte 15,56,201,220 cmpl $11,%r11d - jb .Laesenclast12 + jb .Laesenclast7 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast12 + je .Laesenclast7 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast12: +.Laesenclast7: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm9 @@ -2891,17 +1588,17 @@ aesni_cbc_sha1_enc_shaext: pxor %xmm5,%xmm3 .byte 15,56,201,229 cmpl $11,%r11d - jb .Laesenclast13 + jb .Laesenclast8 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast13 + je .Laesenclast8 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast13: +.Laesenclast8: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -2955,17 +1652,17 @@ aesni_cbc_sha1_enc_shaext: movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 cmpl $11,%r11d - jb .Laesenclast14 + jb .Laesenclast9 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je 
.Laesenclast14 + je .Laesenclast9 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast14: +.Laesenclast9: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 decq %rdx @@ -2975,8 +1672,8 @@ aesni_cbc_sha1_enc_shaext: leaq 64(%rdi),%rdi jnz .Loop_shaext - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 movups %xmm2,(%r8) movdqu %xmm8,(%r9) movd %xmm9,16(%r9) diff --git a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s index 3df7f1bf5222a2..2c85f62495fe5e 100644 --- a/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/aes/aesni-sha256-x86_64.s @@ -5,25 +5,6 @@ .type aesni_cbc_sha256_enc,@function .align 16 aesni_cbc_sha256_enc: - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl $1,%eax - cmpq $0,%rdi - je .Lprobe - movl 0(%r11),%eax - movq 4(%r11),%r10 - btq $61,%r10 - jc aesni_cbc_sha256_enc_shaext - movq %r10,%r11 - shrq $32,%r11 - - testl $2048,%r10d - jnz aesni_cbc_sha256_enc_xop - andl $296,%r11d - cmpl $296,%r11d - je aesni_cbc_sha256_enc_avx2 - andl $268435456,%r10d - jnz aesni_cbc_sha256_enc_avx - ud2 xorl %eax,%eax cmpq $0,%rdi je .Lprobe @@ -74,4281 +55,3 @@ K256: .long 0,0,0,0, 0,0,0,0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 -.type aesni_cbc_sha256_enc_xop,@function -.align 64 -aesni_cbc_sha256_enc_xop: -.Lxop_shortcut: - movq 8(%rsp),%r10 - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) -.Lprologue_xop: - vzeroall - - movq %rdi,%r12 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - subq $-32*4,%rbp - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm2,%xmm3,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm0,%xmm0 - andl 
%r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,251,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm3,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm0,%xmm0 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,248,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm0,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor %xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - vpaddd %xmm7,%xmm0,%xmm0 - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm3,%xmm0,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm1,%xmm1 - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,248,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm0,%xmm6 - rorl $2,%r14d - addl %esi,%edx - vpaddd %xmm4,%xmm1,%xmm1 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 
80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,249,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm1,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm1,%xmm1 - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm0,%xmm1,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm2,%xmm2 - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,249,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm1,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm2,%xmm2 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,250,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm2,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor %xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d 
- vpaddd %xmm7,%xmm2,%xmm2 - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm1,%xmm2,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm3,%xmm3 - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,250,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm2,%xmm6 - rorl $2,%r14d - addl %esi,%edx - vpaddd %xmm4,%xmm3,%xmm3 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,251,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm3,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm3,%xmm3 - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl 
$11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne .Lxop_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl 
%r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl 
%ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jb .Lloop_xop - - movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi - vmovdqu %xmm8,(%r8) - vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop -.type aesni_cbc_sha256_enc_avx,@function -.align 64 -aesni_cbc_sha256_enc_avx: -.Lavx_shortcut: - movq 8(%rsp),%r10 - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) -.Lprologue_avx: - vzeroall - - movq %rdi,%r12 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 
16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - subq $-32*4,%rbp - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm3,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpaddd %xmm6,%xmm0,%xmm0 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm0,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq 
$8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 0(%rbp),%xmm0,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm0,%xmm7 - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpaddd %xmm6,%xmm1,%xmm1 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm1,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - vpaddd 32(%rbp),%xmm1,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - 
addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm1,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpaddd %xmm6,%xmm2,%xmm2 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm2,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 64(%rbp),%xmm2,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 
- vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm2,%xmm7 - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpaddd %xmm6,%xmm3,%xmm3 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm3,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - vpaddd 96(%rbp),%xmm3,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne .Lavx_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d 
- addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl 
%r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl 
%r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - jb .Lloop_avx - - movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi - vmovdqu %xmm8,(%r8) - vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx -.type aesni_cbc_sha256_enc_avx2,@function -.align 64 -aesni_cbc_sha256_enc_avx2: -.Lavx2_shortcut: - movq 8(%rsp),%r10 - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $576,%rsp - andq $-1024,%rsp - addq $448,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) -.Lprologue_avx2: - vzeroall - - movq %rdi,%r13 - vpinsrq $1,%rsi,%xmm15,%xmm15 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r12 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - leaq -9(%r14),%r14 - - vmovdqa 0(%r12,%r14,8),%xmm14 - vmovdqa 16(%r12,%r14,8),%xmm13 - vmovdqa 32(%r12,%r14,8),%xmm12 - - subq $-64,%r13 - movl 0(%r15),%eax - leaq (%rsi,%r13,1),%r12 - movl 4(%r15),%ebx - cmpq %rdx,%r13 - movl 8(%r15),%ecx - cmoveq %rsp,%r12 - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 
24(%r15),%r10d - movl 28(%r15),%r11d - vmovdqu 0-128(%rdi),%xmm10 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqa K256+512(%rip),%ymm7 - vmovdqu -64+0(%rsi,%r13,1),%xmm0 - vmovdqu -64+16(%rsi,%r13,1),%xmm1 - vmovdqu -64+32(%rsi,%r13,1),%xmm2 - vmovdqu -64+48(%rsi,%r13,1),%xmm3 - - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm7,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm7,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - - leaq K256(%rip),%rbp - vpshufb %ymm7,%ymm2,%ymm2 - leaq -64(%r13),%r13 - vpaddd 0(%rbp),%ymm0,%ymm4 - vpshufb %ymm7,%ymm3,%ymm3 - vpaddd 32(%rbp),%ymm1,%ymm5 - vpaddd 64(%rbp),%ymm2,%ymm6 - vpaddd 96(%rbp),%ymm3,%ymm7 - vmovdqa %ymm4,0(%rsp) - xorl %r14d,%r14d - vmovdqa %ymm5,32(%rsp) - leaq -64(%rsp),%rsp - movl %ebx,%esi - vmovdqa %ymm6,0(%rsp) - xorl %ecx,%esi - vmovdqa %ymm7,32(%rsp) - movl %r9d,%r12d - subq $-32*4,%rbp - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - leaq -64(%rsp),%rsp - vpalignr $4,%ymm0,%ymm1,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm2,%ymm3,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm0,%ymm0 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm3,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm0,%ymm0 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm0,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal 
(%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - vpaddd %ymm6,%ymm0,%ymm0 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 0(%rbp),%ymm0,%ymm6 - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm1,%ymm2,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm3,%ymm0,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm1,%ymm1 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm0,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm1,%ymm1 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm1,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl 
$13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm1,%ymm1 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 32(%rbp),%ymm1,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq -64(%rsp),%rsp - vpalignr $4,%ymm2,%ymm3,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm0,%ymm1,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm2,%ymm2 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm1,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm2,%ymm2 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm2,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - vpaddd %ymm6,%ymm2,%ymm2 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 64(%rbp),%ymm2,%ymm6 - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl 
%ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm3,%ymm0,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm1,%ymm2,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm3,%ymm3 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm2,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm3,%ymm3 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm3,%ymm7 - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm3,%ymm3 - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 96(%rbp),%ymm3,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 - leaq 128(%rbp),%rbp - cmpb $0,3(%rbp) - 
jne .Lavx2_00_47 - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+64(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+64(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+64(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+64(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+64(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+64(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 
40+64(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+64(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - addl 0(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32(%rsp),%edx - andl 
%eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vpextrq $1,%xmm15,%r12 - vmovq %xmm15,%r13 - movq 552(%rsp),%r15 - addl %r14d,%eax - leaq 448(%rsp),%rbp - - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r13),%r13 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - cmpq 80(%rbp),%r13 - je .Ldone_avx2 - - xorl %r14d,%r14d - movl %ebx,%esi - movl %r9d,%r12d - xorl %ecx,%esi - jmp .Lower_avx2 -.align 16 -.Lower_avx2: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl 
$2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - 
vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - leaq -64(%rbp),%rbp - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - 
vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - leaq -64(%rbp),%rbp - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 - cmpq %rsp,%rbp - jae .Lower_avx2 - - movq 552(%rsp),%r15 - leaq 64(%r13),%r13 - movq 560(%rsp),%rsi - addl %r14d,%eax - leaq 448(%rsp),%rsp - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - leaq (%rsi,%r13,1),%r12 - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r13 - - movl %eax,0(%r15) - cmoveq %rsp,%r12 - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jbe .Loop_avx2 - leaq (%rsp),%rbp - -.Ldone_avx2: - leaq (%rbp),%rsp - movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi - vmovdqu %xmm8,(%r8) - vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2 -.type aesni_cbc_sha256_enc_shaext,@function -.align 32 -aesni_cbc_sha256_enc_shaext: - movq 8(%rsp),%r10 - leaq K256+128(%rip),%rax - movdqu (%r9),%xmm1 - movdqu 16(%r9),%xmm2 - movdqa 512-128(%rax),%xmm3 - - movl 240(%rcx),%r11d - subq %rdi,%rsi - movups (%rcx),%xmm15 - movups 16(%rcx),%xmm4 - leaq 112(%rcx),%rcx - - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 - movdqa %xmm3,%xmm7 -.byte 
102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu (%r10),%xmm10 - movdqu 16(%r10),%xmm11 - movdqu 32(%r10),%xmm12 -.byte 102,68,15,56,0,211 - movdqu 48(%r10),%xmm13 - - movdqa 0-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 102,68,15,56,0,219 - movdqa %xmm2,%xmm9 - movdqa %xmm1,%xmm8 - movups 0(%rdi),%xmm14 - xorps %xmm15,%xmm14 - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 32-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 102,68,15,56,0,227 - leaq 64(%r10),%r10 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 64-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 102,68,15,56,0,235 -.byte 69,15,56,204,211 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 96-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 -.byte 15,56,203,202 - movdqa 128-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - cmpl $11,%r11d - jb .Laesenclast1 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast1 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast1: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 16(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,0(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 160-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 192-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 224-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 256-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl $11,%r11d - jb .Laesenclast2 - movups 
64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast2 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast2: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 32(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,16(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 288-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 320-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 352-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 384-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 416-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - cmpl $11,%r11d - jb .Laesenclast3 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast3 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast3: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups 48(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,32(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 448-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 - movdqa %xmm7,%xmm3 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 480-128(%rax),%xmm0 - paddd %xmm13,%xmm0 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl $11,%r11d - jb .Laesenclast4 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je .Laesenclast4 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.Laesenclast4: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop - - paddd %xmm9,%xmm2 - paddd %xmm8,%xmm1 - - decq %rdx - movups %xmm6,48(%rsi,%rdi,1) - leaq 64(%rdi),%rdi - jnz .Loop_shaext - - pshufd $177,%xmm2,%xmm2 - 
-	pshufd	$27,%xmm1,%xmm3
-	pshufd	$177,%xmm1,%xmm1
-	punpckhqdq	%xmm2,%xmm1
-.byte	102,15,58,15,211,8
-
-	movups	%xmm6,(%r8)
-	movdqu	%xmm1,(%r9)
-	movdqu	%xmm2,16(%r9)
-	.byte	0xf3,0xc3
-.size	aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext
diff --git a/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s b/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s
index 8f356fc3d51a4e..d8b8bd8de5a4b3 100644
--- a/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s
+++ b/deps/openssl/asm/x64-elf-gas/bn/rsaz-avx2.s
@@ -1,1632 +1,25 @@
 .text
+.globl	rsaz_avx2_eligible
+.type	rsaz_avx2_eligible,@function
+rsaz_avx2_eligible:
+	xorl	%eax,%eax
+	.byte	0xf3,0xc3
+.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
 .globl	rsaz_1024_sqr_avx2
+.globl	rsaz_1024_mul_avx2
+.globl	rsaz_1024_norm2red_avx2
+.globl	rsaz_1024_red2norm_avx2
+.globl	rsaz_1024_scatter5_avx2
+.globl	rsaz_1024_gather5_avx2
 .type	rsaz_1024_sqr_avx2,@function
-.align	64
 rsaz_1024_sqr_avx2:
-	leaq	(%rsp),%rax
-	pushq	%rbx
-	pushq	%rbp
-	pushq	%r12
-	pushq	%r13
-	pushq	%r14
-	pushq	%r15
-	vzeroupper
-	movq	%rax,%rbp
-	movq	%rdx,%r13
-	subq	$832,%rsp
-	movq	%r13,%r15
-	subq	$-128,%rdi
-	subq	$-128,%rsi
-	subq	$-128,%r13
-
-	andq	$4095,%r15
-	addq	$320,%r15
-	shrq	$12,%r15
-	vpxor	%ymm9,%ymm9,%ymm9
-	jz	.Lsqr_1024_no_n_copy
-
-
-
-
-
-	subq	$320,%rsp
-	vmovdqu	0-128(%r13),%ymm0
-	andq	$-2048,%rsp
-	vmovdqu	32-128(%r13),%ymm1
-	vmovdqu	64-128(%r13),%ymm2
-	vmovdqu	96-128(%r13),%ymm3
-	vmovdqu	128-128(%r13),%ymm4
-	vmovdqu	160-128(%r13),%ymm5
-	vmovdqu	192-128(%r13),%ymm6
-	vmovdqu	224-128(%r13),%ymm7
-	vmovdqu	256-128(%r13),%ymm8
-	leaq	832+128(%rsp),%r13
-	vmovdqu	%ymm0,0-128(%r13)
-	vmovdqu	%ymm1,32-128(%r13)
-	vmovdqu	%ymm2,64-128(%r13)
-	vmovdqu	%ymm3,96-128(%r13)
-	vmovdqu	%ymm4,128-128(%r13)
-	vmovdqu	%ymm5,160-128(%r13)
-	vmovdqu	%ymm6,192-128(%r13)
-	vmovdqu	%ymm7,224-128(%r13)
-	vmovdqu	%ymm8,256-128(%r13)
-	vmovdqu	%ymm9,288-128(%r13)
-
-.Lsqr_1024_no_n_copy:
-	andq	$-1024,%rsp
-
-	vmovdqu	32-128(%rsi),%ymm1
-	vmovdqu	64-128(%rsi),%ymm2
-	vmovdqu	96-128(%rsi),%ymm3
-	vmovdqu	128-128(%rsi),%ymm4
-	vmovdqu	160-128(%rsi),%ymm5
-	vmovdqu	192-128(%rsi),%ymm6
-	vmovdqu	224-128(%rsi),%ymm7
-	vmovdqu	256-128(%rsi),%ymm8
-
-	leaq	192(%rsp),%rbx
-	vpbroadcastq	.Land_mask(%rip),%ymm15
-	jmp	.LOOP_GRANDE_SQR_1024
-
-.align	32
-.LOOP_GRANDE_SQR_1024:
-	leaq	576+128(%rsp),%r9
-	leaq	448(%rsp),%r12
-
-
-
-
-	vpaddq	%ymm1,%ymm1,%ymm1
-	vpbroadcastq	0-128(%rsi),%ymm10
-	vpaddq	%ymm2,%ymm2,%ymm2
-	vmovdqa	%ymm1,0-128(%r9)
-	vpaddq	%ymm3,%ymm3,%ymm3
-	vmovdqa	%ymm2,32-128(%r9)
-	vpaddq	%ymm4,%ymm4,%ymm4
-	vmovdqa	%ymm3,64-128(%r9)
-	vpaddq	%ymm5,%ymm5,%ymm5
-	vmovdqa	%ymm4,96-128(%r9)
-	vpaddq	%ymm6,%ymm6,%ymm6
-	vmovdqa	%ymm5,128-128(%r9)
-	vpaddq	%ymm7,%ymm7,%ymm7
-	vmovdqa	%ymm6,160-128(%r9)
-	vpaddq	%ymm8,%ymm8,%ymm8
-	vmovdqa	%ymm7,192-128(%r9)
-	vpxor	%ymm9,%ymm9,%ymm9
-	vmovdqa	%ymm8,224-128(%r9)
-
-	vpmuludq	0-128(%rsi),%ymm10,%ymm0
-	vpbroadcastq	32-128(%rsi),%ymm11
-	vmovdqu	%ymm9,288-192(%rbx)
-	vpmuludq	%ymm10,%ymm1,%ymm1
-	vmovdqu	%ymm9,320-448(%r12)
-	vpmuludq	%ymm10,%ymm2,%ymm2
-	vmovdqu	%ymm9,352-448(%r12)
-	vpmuludq	%ymm10,%ymm3,%ymm3
-	vmovdqu	%ymm9,384-448(%r12)
-	vpmuludq	%ymm10,%ymm4,%ymm4
-	vmovdqu	%ymm9,416-448(%r12)
-	vpmuludq	%ymm10,%ymm5,%ymm5
-	vmovdqu	%ymm9,448-448(%r12)
-	vpmuludq	%ymm10,%ymm6,%ymm6
-	vmovdqu	%ymm9,480-448(%r12)
-	vpmuludq	%ymm10,%ymm7,%ymm7
-	vmovdqu	%ymm9,512-448(%r12)
-	vpmuludq	%ymm10,%ymm8,%ymm8
-	vpbroadcastq	64-128(%rsi),%ymm10
-	vmovdqu	%ymm9,544-448(%r12)
-
-	movq	%rsi,%r15
-	movl	$4,%r14d
-	jmp	.Lsqr_entry_1024
-.align 32 -.LOOP_SQR_1024: - vpbroadcastq 32-128(%r15),%ymm11 - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpaddq 0-192(%rbx),%ymm0,%ymm0 - vpmuludq 0-128(%r9),%ymm10,%ymm1 - vpaddq 32-192(%rbx),%ymm1,%ymm1 - vpmuludq 32-128(%r9),%ymm10,%ymm2 - vpaddq 64-192(%rbx),%ymm2,%ymm2 - vpmuludq 64-128(%r9),%ymm10,%ymm3 - vpaddq 96-192(%rbx),%ymm3,%ymm3 - vpmuludq 96-128(%r9),%ymm10,%ymm4 - vpaddq 128-192(%rbx),%ymm4,%ymm4 - vpmuludq 128-128(%r9),%ymm10,%ymm5 - vpaddq 160-192(%rbx),%ymm5,%ymm5 - vpmuludq 160-128(%r9),%ymm10,%ymm6 - vpaddq 192-192(%rbx),%ymm6,%ymm6 - vpmuludq 192-128(%r9),%ymm10,%ymm7 - vpaddq 224-192(%rbx),%ymm7,%ymm7 - vpmuludq 224-128(%r9),%ymm10,%ymm8 - vpbroadcastq 64-128(%r15),%ymm10 - vpaddq 256-192(%rbx),%ymm8,%ymm8 -.Lsqr_entry_1024: - vmovdqu %ymm0,0-192(%rbx) - vmovdqu %ymm1,32-192(%rbx) - - vpmuludq 32-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 32-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 64-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 96-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 128-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 160-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 192-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 224-128(%r9),%ymm11,%ymm0 - vpbroadcastq 96-128(%r15),%ymm11 - vpaddq 288-192(%rbx),%ymm0,%ymm0 - - vmovdqu %ymm2,64-192(%rbx) - vmovdqu %ymm3,96-192(%rbx) - - vpmuludq 64-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 64-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 96-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 128-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 160-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 224-128(%r9),%ymm10,%ymm1 - vpbroadcastq 128-128(%r15),%ymm10 - vpaddq 320-448(%r12),%ymm1,%ymm1 - - vmovdqu %ymm4,128-192(%rbx) - vmovdqu %ymm5,160-192(%rbx) - - vpmuludq 96-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 96-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq 128-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm0,%ymm0 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq 224-128(%r9),%ymm11,%ymm2 - vpbroadcastq 160-128(%r15),%ymm11 - vpaddq 352-448(%r12),%ymm2,%ymm2 - - vmovdqu %ymm6,192-192(%rbx) - vmovdqu %ymm7,224-192(%rbx) - - vpmuludq 128-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 128-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 160-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 192-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 224-128(%r9),%ymm10,%ymm3 - vpbroadcastq 192-128(%r15),%ymm10 - vpaddq 384-448(%r12),%ymm3,%ymm3 - - vmovdqu %ymm8,256-192(%rbx) - vmovdqu %ymm0,288-192(%rbx) - leaq 8(%rbx),%rbx - - vpmuludq 160-128(%rsi),%ymm11,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 224-128(%r9),%ymm11,%ymm4 - vpbroadcastq 224-128(%r15),%ymm11 - vpaddq 416-448(%r12),%ymm4,%ymm4 - - vmovdqu %ymm1,320-448(%r12) - vmovdqu %ymm2,352-448(%r12) - - vpmuludq 192-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpbroadcastq 256-128(%r15),%ymm0 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq 224-128(%r9),%ymm10,%ymm5 - 
vpbroadcastq 0+8-128(%r15),%ymm10 - vpaddq 448-448(%r12),%ymm5,%ymm5 - - vmovdqu %ymm3,384-448(%r12) - vmovdqu %ymm4,416-448(%r12) - leaq 8(%r15),%r15 - - vpmuludq 224-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 224-128(%r9),%ymm11,%ymm6 - vpaddq 480-448(%r12),%ymm6,%ymm6 - - vpmuludq 256-128(%rsi),%ymm0,%ymm7 - vmovdqu %ymm5,448-448(%r12) - vpaddq 512-448(%r12),%ymm7,%ymm7 - vmovdqu %ymm6,480-448(%r12) - vmovdqu %ymm7,512-448(%r12) - leaq 8(%r12),%r12 - - decl %r14d - jnz .LOOP_SQR_1024 - - vmovdqu 256(%rsp),%ymm8 - vmovdqu 288(%rsp),%ymm1 - vmovdqu 320(%rsp),%ymm2 - leaq 192(%rsp),%rbx - - vpsrlq $29,%ymm8,%ymm14 - vpand %ymm15,%ymm8,%ymm8 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - - vpermq $147,%ymm14,%ymm14 - vpxor %ymm9,%ymm9,%ymm9 - vpermq $147,%ymm11,%ymm11 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm8,%ymm8 - vpblendd $3,%ymm11,%ymm9,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,288-192(%rbx) - vmovdqu %ymm2,320-192(%rbx) - - movq (%rsp),%rax - movq 8(%rsp),%r10 - movq 16(%rsp),%r11 - movq 24(%rsp),%r12 - vmovdqu 32(%rsp),%ymm1 - vmovdqu 64-192(%rbx),%ymm2 - vmovdqu 96-192(%rbx),%ymm3 - vmovdqu 128-192(%rbx),%ymm4 - vmovdqu 160-192(%rbx),%ymm5 - vmovdqu 192-192(%rbx),%ymm6 - vmovdqu 224-192(%rbx),%ymm7 - - movq %rax,%r9 - imull %ecx,%eax - andl $536870911,%eax - vmovd %eax,%xmm12 - - movq %rax,%rdx - imulq -128(%r13),%rax - vpbroadcastq %xmm12,%ymm12 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax - shrq $29,%r9 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - addq %r9,%r10 - addq %rax,%r11 - imulq 24-128(%r13),%rdx - addq %rdx,%r12 - - movq %r10,%rax - imull %ecx,%eax - andl $536870911,%eax - - movl $9,%r14d - jmp .LOOP_REDUCE_1024 - -.align 32 -.LOOP_REDUCE_1024: - vmovd %eax,%xmm13 - vpbroadcastq %xmm13,%ymm13 - - vpmuludq 32-128(%r13),%ymm12,%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm10,%ymm1,%ymm1 - addq %rax,%r10 - vpmuludq 64-128(%r13),%ymm12,%ymm14 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm14,%ymm2,%ymm2 - vpmuludq 96-128(%r13),%ymm12,%ymm11 -.byte 0x67 - addq %rax,%r11 -.byte 0x67 - movq %rdx,%rax - imulq 16-128(%r13),%rax - shrq $29,%r10 - vpaddq %ymm11,%ymm3,%ymm3 - vpmuludq 128-128(%r13),%ymm12,%ymm10 - addq %rax,%r12 - addq %r10,%r11 - vpaddq %ymm10,%ymm4,%ymm4 - vpmuludq 160-128(%r13),%ymm12,%ymm14 - movq %r11,%rax - imull %ecx,%eax - vpaddq %ymm14,%ymm5,%ymm5 - vpmuludq 192-128(%r13),%ymm12,%ymm11 - andl $536870911,%eax - vpaddq %ymm11,%ymm6,%ymm6 - vpmuludq 224-128(%r13),%ymm12,%ymm10 - vpaddq %ymm10,%ymm7,%ymm7 - vpmuludq 256-128(%r13),%ymm12,%ymm14 - vmovd %eax,%xmm12 - - vpaddq %ymm14,%ymm8,%ymm8 - - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 32-8-128(%r13),%ymm13,%ymm11 - vmovdqu 96-8-128(%r13),%ymm14 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm1,%ymm1 - vpmuludq 64-8-128(%r13),%ymm13,%ymm10 - vmovdqu 128-8-128(%r13),%ymm11 - addq %rax,%r11 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm10,%ymm2,%ymm2 - addq %r12,%rax - shrq $29,%r11 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 160-8-128(%r13),%ymm10 - addq %r11,%rax - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 192-8-128(%r13),%ymm14 -.byte 0x67 - movq %rax,%r12 - imull %ecx,%eax - vpaddq %ymm11,%ymm4,%ymm4 - vpmuludq %ymm13,%ymm10,%ymm10 -.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 - andl $536870911,%eax - vpaddq %ymm10,%ymm5,%ymm5 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 256-8-128(%r13),%ymm10 - 
vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 288-8-128(%r13),%ymm9 - vmovd %eax,%xmm0 - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm7,%ymm7 - vpmuludq %ymm13,%ymm10,%ymm10 - vmovdqu 32-16-128(%r13),%ymm14 - vpbroadcastq %xmm0,%ymm0 - vpaddq %ymm10,%ymm8,%ymm8 - vpmuludq %ymm13,%ymm9,%ymm9 - vmovdqu 64-16-128(%r13),%ymm11 - addq %rax,%r12 - - vmovdqu 32-24-128(%r13),%ymm13 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 96-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq %ymm0,%ymm13,%ymm13 - vpmuludq %ymm12,%ymm11,%ymm11 -.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff - vpaddq %ymm1,%ymm13,%ymm13 - vpaddq %ymm11,%ymm2,%ymm2 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 160-16-128(%r13),%ymm11 -.byte 0x67 - vmovq %xmm13,%rax - vmovdqu %ymm13,(%rsp) - vpaddq %ymm10,%ymm3,%ymm3 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 192-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq %ymm12,%ymm11,%ymm11 - vmovdqu 224-16-128(%r13),%ymm14 - vpaddq %ymm11,%ymm5,%ymm5 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 256-16-128(%r13),%ymm11 - vpaddq %ymm10,%ymm6,%ymm6 - vpmuludq %ymm12,%ymm14,%ymm14 - shrq $29,%r12 - vmovdqu 288-16-128(%r13),%ymm10 - addq %r12,%rax - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq %ymm12,%ymm11,%ymm11 - - movq %rax,%r9 - imull %ecx,%eax - vpaddq %ymm11,%ymm8,%ymm8 - vpmuludq %ymm12,%ymm10,%ymm10 - andl $536870911,%eax - vmovd %eax,%xmm12 - vmovdqu 96-24-128(%r13),%ymm11 -.byte 0x67 - vpaddq %ymm10,%ymm9,%ymm9 - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 64-24-128(%r13),%ymm0,%ymm14 - vmovdqu 128-24-128(%r13),%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - movq 8(%rsp),%r10 - vpaddq %ymm14,%ymm2,%ymm1 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 160-24-128(%r13),%ymm14 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax -.byte 0x67 - shrq $29,%r9 - movq 16(%rsp),%r11 - vpaddq %ymm11,%ymm3,%ymm2 - vpmuludq %ymm0,%ymm10,%ymm10 - vmovdqu 192-24-128(%r13),%ymm11 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - vpaddq %ymm10,%ymm4,%ymm3 - vpmuludq %ymm0,%ymm14,%ymm14 - vmovdqu 224-24-128(%r13),%ymm10 - imulq 24-128(%r13),%rdx - addq %rax,%r11 - leaq (%r9,%r10,1),%rax - vpaddq %ymm14,%ymm5,%ymm4 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 256-24-128(%r13),%ymm14 - movq %rax,%r10 - imull %ecx,%eax - vpmuludq %ymm0,%ymm10,%ymm10 - vpaddq %ymm11,%ymm6,%ymm5 - vmovdqu 288-24-128(%r13),%ymm11 - andl $536870911,%eax - vpaddq %ymm10,%ymm7,%ymm6 - vpmuludq %ymm0,%ymm14,%ymm14 - addq 24(%rsp),%rdx - vpaddq %ymm14,%ymm8,%ymm7 - vpmuludq %ymm0,%ymm11,%ymm11 - vpaddq %ymm11,%ymm9,%ymm8 - vmovq %r12,%xmm9 - movq %rdx,%r12 - - decl %r14d - jnz .LOOP_REDUCE_1024 - leaq 448(%rsp),%r12 - vpaddq %ymm9,%ymm13,%ymm0 - vpxor %ymm9,%ymm9,%ymm9 - - vpaddq 288-192(%rbx),%ymm0,%ymm0 - vpaddq 320-448(%r12),%ymm1,%ymm1 - vpaddq 352-448(%r12),%ymm2,%ymm2 - vpaddq 384-448(%r12),%ymm3,%ymm3 - vpaddq 416-448(%r12),%ymm4,%ymm4 - vpaddq 448-448(%r12),%ymm5,%ymm5 - vpaddq 480-448(%r12),%ymm6,%ymm6 - vpaddq 512-448(%r12),%ymm7,%ymm7 - vpaddq 544-448(%r12),%ymm8,%ymm8 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $147,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq 
%ymm11,%ymm2,%ymm2 - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm13,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $147,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vmovdqu %ymm0,0-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,32-128(%rdi) - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vmovdqu %ymm2,64-128(%rdi) - vpaddq %ymm13,%ymm4,%ymm4 - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vpaddq %ymm13,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vmovdqu %ymm4,128-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vmovdqu %ymm5,160-128(%rdi) - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vmovdqu %ymm6,192-128(%rdi) - vpaddq %ymm13,%ymm8,%ymm8 - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - - movq %rdi,%rsi - decl %r8d - jne .LOOP_GRANDE_SQR_1024 - - vzeroall - movq %rbp,%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lsqr_1024_epilogue: - .byte 0xf3,0xc3 -.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 -.globl rsaz_1024_mul_avx2 -.type rsaz_1024_mul_avx2,@function -.align 64 rsaz_1024_mul_avx2: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rax,%rbp - vzeroall - movq %rdx,%r13 - subq $64,%rsp - - - - - - -.byte 0x67,0x67 - movq %rsi,%r15 - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - movq %rsi,%r15 - cmovnzq %r13,%rsi - cmovnzq %r15,%r13 - - movq %rcx,%r15 - subq $-128,%rsi - subq $-128,%rcx - subq $-128,%rdi - - andq $4095,%r15 - addq $320,%r15 -.byte 0x67,0x67 - shrq $12,%r15 - jz .Lmul_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%rcx),%ymm0 - andq $-512,%rsp - vmovdqu 32-128(%rcx),%ymm1 - vmovdqu 64-128(%rcx),%ymm2 - vmovdqu 96-128(%rcx),%ymm3 - vmovdqu 128-128(%rcx),%ymm4 - vmovdqu 
160-128(%rcx),%ymm5 - vmovdqu 192-128(%rcx),%ymm6 - vmovdqu 224-128(%rcx),%ymm7 - vmovdqu 256-128(%rcx),%ymm8 - leaq 64+128(%rsp),%rcx - vmovdqu %ymm0,0-128(%rcx) - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm1,32-128(%rcx) - vpxor %ymm1,%ymm1,%ymm1 - vmovdqu %ymm2,64-128(%rcx) - vpxor %ymm2,%ymm2,%ymm2 - vmovdqu %ymm3,96-128(%rcx) - vpxor %ymm3,%ymm3,%ymm3 - vmovdqu %ymm4,128-128(%rcx) - vpxor %ymm4,%ymm4,%ymm4 - vmovdqu %ymm5,160-128(%rcx) - vpxor %ymm5,%ymm5,%ymm5 - vmovdqu %ymm6,192-128(%rcx) - vpxor %ymm6,%ymm6,%ymm6 - vmovdqu %ymm7,224-128(%rcx) - vpxor %ymm7,%ymm7,%ymm7 - vmovdqu %ymm8,256-128(%rcx) - vmovdqa %ymm0,%ymm8 - vmovdqu %ymm9,288-128(%rcx) -.Lmul_1024_no_n_copy: - andq $-64,%rsp - - movq (%r13),%rbx - vpbroadcastq (%r13),%ymm10 - vmovdqu %ymm0,(%rsp) - xorq %r9,%r9 -.byte 0x67 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - - vmovdqu .Land_mask(%rip),%ymm15 - movl $9,%r14d - vmovdqu %ymm9,288-128(%rdi) - jmp .Loop_mul_1024 - -.align 32 -.Loop_mul_1024: - vpsrlq $29,%ymm3,%ymm9 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r9,%rax - movq %rbx,%r10 - imulq 8-128(%rsi),%r10 - addq 8(%rsp),%r10 - - movq %rax,%r9 - imull %r8d,%eax - andl $536870911,%eax - - movq %rbx,%r11 - imulq 16-128(%rsi),%r11 - addq 16(%rsp),%r11 - - movq %rbx,%r12 - imulq 24-128(%rsi),%r12 - addq 24(%rsp),%r12 - vpmuludq 32-128(%rsi),%ymm10,%ymm0 - vmovd %eax,%xmm11 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq 64-128(%rsi),%ymm10,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 96-128(%rsi),%ymm10,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq 128-128(%rsi),%ymm10,%ymm0 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq 160-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 192-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq 224-128(%rsi),%ymm10,%ymm0 - vpermq $147,%ymm9,%ymm9 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq 256-128(%rsi),%ymm10,%ymm12 - vpbroadcastq 8(%r13),%ymm10 - vpaddq %ymm12,%ymm8,%ymm8 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%rcx),%rax - addq %rax,%r11 - shrq $29,%r9 - imulq 24-128(%rcx),%rdx - addq %rdx,%r12 - addq %r9,%r10 - - vpmuludq 32-128(%rcx),%ymm11,%ymm13 - vmovq %xmm10,%rbx - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 64-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm2,%ymm2 - vpmuludq 96-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 128-128(%rcx),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 160-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm5,%ymm5 - vpmuludq 192-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm9 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm9,%ymm3,%ymm3 - vpaddq %ymm0,%ymm8,%ymm8 - - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rsi),%ymm12 - movq %rbx,%rax - imulq 8-128(%rsi),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rsi),%ymm13 - - movq %r10,%rax - imull %r8d,%eax - andl $536870911,%eax - - imulq 16-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovd %eax,%xmm11 - vmovdqu -8+96-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -8+128-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+160-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+192-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm4,%ymm4 - 
vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -8+224-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+256-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+288-128(%rsi),%ymm9 - vpaddq %ymm12,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm13,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm9,%ymm9 - vpbroadcastq 16(%r13),%ymm10 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rcx),%ymm0 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rcx),%ymm12 - shrq $29,%r10 - imulq 16-128(%rcx),%rdx - addq %rdx,%r12 - addq %r10,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -8+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rsi),%ymm0 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r11,%rax - - vmovdqu -16+64-128(%rsi),%ymm12 - movq %rax,%r11 - imull %r8d,%eax - andl $536870911,%eax - - imulq 8-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -16+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -16+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -16+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 24(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rcx),%ymm0 - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r11 - vmovdqu -16+64-128(%rcx),%ymm12 - imulq 8-128(%rcx),%rdx - addq %rdx,%r12 - shrq $29,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -16+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+32-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+64-128(%rsi),%ymm12 - 
vpaddq %ymm13,%ymm9,%ymm9 - - addq %r11,%r12 - imulq -128(%rsi),%rbx - addq %rbx,%r12 - - movq %r12,%rax - imull %r8d,%eax - andl $536870911,%eax - - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -24+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -24+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -24+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 32(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - addq $32,%r13 - - vmovdqu -24+32-128(%rcx),%ymm0 - imulq -128(%rcx),%rax - addq %rax,%r12 - shrq $29,%r12 - - vmovdqu -24+64-128(%rcx),%ymm12 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -24+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm0 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu %ymm0,(%rsp) - vpaddq %ymm12,%ymm2,%ymm1 - vmovdqu -24+128-128(%rcx),%ymm0 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm2 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm3 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm4 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm5 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+288-128(%rcx),%ymm13 - movq %r12,%r9 - vpaddq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm11,%ymm12,%ymm12 - addq (%rsp),%r9 - vpaddq %ymm12,%ymm8,%ymm7 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovq %r12,%xmm12 - vpaddq %ymm13,%ymm9,%ymm8 - - decl %r14d - jnz .Loop_mul_1024 - vpermq $0,%ymm15,%ymm15 - vpaddq (%rsp),%ymm12,%ymm0 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm10,%ymm10 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpermq $147,%ymm11,%ymm11 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpermq $147,%ymm10,%ymm10 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm11,%ymm11 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vmovdqu %ymm0,0-128(%rdi) - vmovdqu %ymm1,32-128(%rdi) - vmovdqu %ymm2,64-128(%rdi) - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm12 - vpand 
%ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vmovdqu %ymm4,128-128(%rdi) - vmovdqu %ymm5,160-128(%rdi) - vmovdqu %ymm6,192-128(%rdi) - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - vzeroupper - - movq %rbp,%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lmul_1024_epilogue: - .byte 0xf3,0xc3 -.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 -.globl rsaz_1024_red2norm_avx2 -.type rsaz_1024_red2norm_avx2,@function -.align 32 -rsaz_1024_red2norm_avx2: - subq $-128,%rsi - xorq %rax,%rax - movq -128(%rsi),%r8 - movq -120(%rsi),%r9 - movq -112(%rsi),%r10 - shlq $0,%r8 - shlq $29,%r9 - movq %r10,%r11 - shlq $58,%r10 - shrq $6,%r11 - addq %r8,%rax - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,0(%rdi) - movq %r11,%rax - movq -104(%rsi),%r8 - movq -96(%rsi),%r9 - shlq $23,%r8 - movq %r9,%r10 - shlq $52,%r9 - shrq $12,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,8(%rdi) - movq %r10,%rax - movq -88(%rsi),%r11 - movq -80(%rsi),%r8 - shlq $17,%r11 - movq %r8,%r9 - shlq $46,%r8 - shrq $18,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,16(%rdi) - movq %r9,%rax - movq -72(%rsi),%r10 - movq -64(%rsi),%r11 - shlq $11,%r10 - movq %r11,%r8 - shlq $40,%r11 - shrq $24,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,24(%rdi) - movq %r8,%rax - movq -56(%rsi),%r9 - movq -48(%rsi),%r10 - movq -40(%rsi),%r11 - shlq $5,%r9 - shlq $34,%r10 - movq %r11,%r8 - shlq $63,%r11 - shrq $1,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,32(%rdi) - movq %r8,%rax - movq -32(%rsi),%r9 - movq -24(%rsi),%r10 - shlq $28,%r9 - movq %r10,%r11 - shlq $57,%r10 - shrq $7,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,40(%rdi) - movq %r11,%rax - movq -16(%rsi),%r8 - movq -8(%rsi),%r9 - shlq $22,%r8 - movq %r9,%r10 - shlq $51,%r9 - shrq $13,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,48(%rdi) - movq %r10,%rax - movq 0(%rsi),%r11 - movq 8(%rsi),%r8 - shlq $16,%r11 - movq %r8,%r9 - shlq $45,%r8 - shrq $19,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - 
movq %rax,56(%rdi) - movq %r9,%rax - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - shlq $10,%r10 - movq %r11,%r8 - shlq $39,%r11 - shrq $25,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,64(%rdi) - movq %r8,%rax - movq 32(%rsi),%r9 - movq 40(%rsi),%r10 - movq 48(%rsi),%r11 - shlq $4,%r9 - shlq $33,%r10 - movq %r11,%r8 - shlq $62,%r11 - shrq $2,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,72(%rdi) - movq %r8,%rax - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - shlq $27,%r9 - movq %r10,%r11 - shlq $56,%r10 - shrq $8,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,80(%rdi) - movq %r11,%rax - movq 72(%rsi),%r8 - movq 80(%rsi),%r9 - shlq $21,%r8 - movq %r9,%r10 - shlq $50,%r9 - shrq $14,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,88(%rdi) - movq %r10,%rax - movq 88(%rsi),%r11 - movq 96(%rsi),%r8 - shlq $15,%r11 - movq %r8,%r9 - shlq $44,%r8 - shrq $20,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,96(%rdi) - movq %r9,%rax - movq 104(%rsi),%r10 - movq 112(%rsi),%r11 - shlq $9,%r10 - movq %r11,%r8 - shlq $38,%r11 - shrq $26,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,104(%rdi) - movq %r8,%rax - movq 120(%rsi),%r9 - movq 128(%rsi),%r10 - movq 136(%rsi),%r11 - shlq $3,%r9 - shlq $32,%r10 - movq %r11,%r8 - shlq $61,%r11 - shrq $3,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,112(%rdi) - movq %r8,%rax - movq 144(%rsi),%r9 - movq 152(%rsi),%r10 - shlq $26,%r9 - movq %r10,%r11 - shlq $55,%r10 - shrq $9,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,120(%rdi) - movq %r11,%rax - .byte 0xf3,0xc3 -.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 - -.globl rsaz_1024_norm2red_avx2 -.type rsaz_1024_norm2red_avx2,@function -.align 32 rsaz_1024_norm2red_avx2: - subq $-128,%rdi - movq (%rsi),%r8 - movl $536870911,%eax - movq 8(%rsi),%r9 - movq %r8,%r11 - shrq $0,%r11 - andq %rax,%r11 - movq %r11,-128(%rdi) - movq %r8,%r10 - shrq $29,%r10 - andq %rax,%r10 - movq %r10,-120(%rdi) - shrdq $58,%r9,%r8 - andq %rax,%r8 - movq %r8,-112(%rdi) - movq 16(%rsi),%r10 - movq %r9,%r8 - shrq $23,%r8 - andq %rax,%r8 - movq %r8,-104(%rdi) - shrdq $52,%r10,%r9 - andq %rax,%r9 - movq %r9,-96(%rdi) - movq 24(%rsi),%r11 - movq %r10,%r9 - shrq $17,%r9 - andq %rax,%r9 - movq %r9,-88(%rdi) - shrdq $46,%r11,%r10 - andq %rax,%r10 - movq %r10,-80(%rdi) - movq 32(%rsi),%r8 - movq %r11,%r10 - shrq $11,%r10 - andq %rax,%r10 - movq %r10,-72(%rdi) - shrdq $40,%r8,%r11 - andq %rax,%r11 - movq %r11,-64(%rdi) - movq 40(%rsi),%r9 - movq %r8,%r11 - shrq $5,%r11 - andq %rax,%r11 - movq %r11,-56(%rdi) - movq %r8,%r10 - shrq $34,%r10 - andq %rax,%r10 - movq %r10,-48(%rdi) - shrdq $63,%r9,%r8 - andq %rax,%r8 - movq %r8,-40(%rdi) - movq 48(%rsi),%r10 - movq %r9,%r8 - shrq $28,%r8 - andq %rax,%r8 - movq %r8,-32(%rdi) - shrdq $57,%r10,%r9 - andq %rax,%r9 - movq %r9,-24(%rdi) - movq 56(%rsi),%r11 - movq %r10,%r9 - shrq $22,%r9 - andq %rax,%r9 - movq %r9,-16(%rdi) - shrdq $51,%r11,%r10 - andq %rax,%r10 - movq %r10,-8(%rdi) - movq 64(%rsi),%r8 - movq %r11,%r10 - shrq $16,%r10 - andq %rax,%r10 - movq %r10,0(%rdi) - shrdq $45,%r8,%r11 - andq %rax,%r11 - movq %r11,8(%rdi) - movq 72(%rsi),%r9 - movq %r8,%r11 - shrq $10,%r11 - andq %rax,%r11 - movq %r11,16(%rdi) - shrdq $39,%r9,%r8 - andq %rax,%r8 - movq %r8,24(%rdi) - movq 80(%rsi),%r10 - movq %r9,%r8 - shrq $4,%r8 - andq %rax,%r8 - movq %r8,32(%rdi) - movq %r9,%r11 - shrq $33,%r11 - andq %rax,%r11 - movq %r11,40(%rdi) - shrdq $62,%r10,%r9 - 
andq %rax,%r9 - movq %r9,48(%rdi) - movq 88(%rsi),%r11 - movq %r10,%r9 - shrq $27,%r9 - andq %rax,%r9 - movq %r9,56(%rdi) - shrdq $56,%r11,%r10 - andq %rax,%r10 - movq %r10,64(%rdi) - movq 96(%rsi),%r8 - movq %r11,%r10 - shrq $21,%r10 - andq %rax,%r10 - movq %r10,72(%rdi) - shrdq $50,%r8,%r11 - andq %rax,%r11 - movq %r11,80(%rdi) - movq 104(%rsi),%r9 - movq %r8,%r11 - shrq $15,%r11 - andq %rax,%r11 - movq %r11,88(%rdi) - shrdq $44,%r9,%r8 - andq %rax,%r8 - movq %r8,96(%rdi) - movq 112(%rsi),%r10 - movq %r9,%r8 - shrq $9,%r8 - andq %rax,%r8 - movq %r8,104(%rdi) - shrdq $38,%r10,%r9 - andq %rax,%r9 - movq %r9,112(%rdi) - movq 120(%rsi),%r11 - movq %r10,%r9 - shrq $3,%r9 - andq %rax,%r9 - movq %r9,120(%rdi) - movq %r10,%r8 - shrq $32,%r8 - andq %rax,%r8 - movq %r8,128(%rdi) - shrdq $61,%r11,%r10 - andq %rax,%r10 - movq %r10,136(%rdi) - xorq %r8,%r8 - movq %r11,%r10 - shrq $26,%r10 - andq %rax,%r10 - movq %r10,144(%rdi) - shrdq $55,%r8,%r11 - andq %rax,%r11 - movq %r11,152(%rdi) - movq %r8,160(%rdi) - movq %r8,168(%rdi) - movq %r8,176(%rdi) - movq %r8,184(%rdi) - .byte 0xf3,0xc3 -.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 -.globl rsaz_1024_scatter5_avx2 -.type rsaz_1024_scatter5_avx2,@function -.align 32 +rsaz_1024_red2norm_avx2: rsaz_1024_scatter5_avx2: - vzeroupper - vmovdqu .Lscatter_permd(%rip),%ymm5 - shll $4,%edx - leaq (%rdi,%rdx,1),%rdi - movl $9,%eax - jmp .Loop_scatter_1024 - -.align 32 -.Loop_scatter_1024: - vmovdqu (%rsi),%ymm0 - leaq 32(%rsi),%rsi - vpermd %ymm0,%ymm5,%ymm0 - vmovdqu %xmm0,(%rdi) - leaq 512(%rdi),%rdi - decl %eax - jnz .Loop_scatter_1024 - - vzeroupper - .byte 0xf3,0xc3 -.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 - -.globl rsaz_1024_gather5_avx2 -.type rsaz_1024_gather5_avx2,@function -.align 32 rsaz_1024_gather5_avx2: - leaq .Lgather_table(%rip),%r11 - movl %edx,%eax - andl $3,%edx - shrl $2,%eax - shll $4,%edx - - vmovdqu -32(%r11),%ymm7 - vpbroadcastb 8(%r11,%rax,1),%xmm8 - vpbroadcastb 7(%r11,%rax,1),%xmm9 - vpbroadcastb 6(%r11,%rax,1),%xmm10 - vpbroadcastb 5(%r11,%rax,1),%xmm11 - vpbroadcastb 4(%r11,%rax,1),%xmm12 - vpbroadcastb 3(%r11,%rax,1),%xmm13 - vpbroadcastb 2(%r11,%rax,1),%xmm14 - vpbroadcastb 1(%r11,%rax,1),%xmm15 - - leaq 64(%rsi,%rdx,1),%rsi - movq $64,%r11 - movl $9,%eax - jmp .Loop_gather_1024 - -.align 32 -.Loop_gather_1024: - vpand -64(%rsi),%xmm8,%xmm0 - vpand (%rsi),%xmm9,%xmm1 - vpand 64(%rsi),%xmm10,%xmm2 - vpand (%rsi,%r11,2),%xmm11,%xmm3 - vpor %xmm0,%xmm1,%xmm1 - vpand 64(%rsi,%r11,2),%xmm12,%xmm4 - vpor %xmm2,%xmm3,%xmm3 - vpand (%rsi,%r11,4),%xmm13,%xmm5 - vpor %xmm1,%xmm3,%xmm3 - vpand 64(%rsi,%r11,4),%xmm14,%xmm6 - vpor %xmm4,%xmm5,%xmm5 - vpand -128(%rsi,%r11,8),%xmm15,%xmm2 - leaq (%rsi,%r11,8),%rsi - vpor %xmm3,%xmm5,%xmm5 - vpor %xmm2,%xmm6,%xmm6 - vpor %xmm5,%xmm6,%xmm6 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqu %ymm6,(%rdi) - leaq 32(%rdi),%rdi - decl %eax - jnz .Loop_gather_1024 - - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - vzeroupper +.byte 0x0f,0x0b .byte 0xf3,0xc3 -.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 - -.globl rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -.align 32 -rsaz_avx2_eligible: - movl OPENSSL_ia32cap_P+8(%rip),%eax - movl $524544,%ecx - movl $0,%edx - andl %eax,%ecx - cmpl $524544,%ecx - cmovel %edx,%eax - andl $32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - -.align 64 -.Land_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 -.Lscatter_permd: -.long 0,2,4,6,7,7,7,7 -.Lgather_permd: -.long 0,7,1,7,2,7,3,7 -.Lgather_table: 
-.byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 -.align 64 +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 diff --git a/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s b/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s index f42075571e66b8..4a1211329c67ea 100644 --- a/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/bn/rsaz-x86_64.s @@ -19,10 +19,6 @@ rsaz_512_sqr: movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) - movl $524544,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je .Loop_sqrx jmp .Loop_sqr .align 32 @@ -386,276 +382,6 @@ rsaz_512_sqr: decl %r8d jnz .Loop_sqr - jmp .Lsqr_tail - -.align 32 -.Loop_sqrx: - movl %r8d,128+8(%rsp) -.byte 102,72,15,110,199 -.byte 102,72,15,110,205 - - mulxq %rax,%r8,%r9 - - mulxq 16(%rsi),%rcx,%r10 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r11 - adcxq %rcx,%r9 - - mulxq 32(%rsi),%rcx,%r12 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rcx,%r11 - -.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r12 - adcxq %rcx,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adcxq %rbp,%r15 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shlq $1,%r8 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rdx,%r8 - movq 8(%rsi),%rdx - adcxq %rbp,%r9 - - movq %rax,(%rsp) - movq %r8,8(%rsp) - - - mulxq 16(%rsi),%rax,%rbx - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %r8,%r12 - - mulxq 32(%rsi),%rax,%rbx - adoxq %rax,%r12 - adcxq %rbx,%r13 - - mulxq 40(%rsi),%rdi,%r8 - adoxq %rdi,%r13 - adcxq %r8,%r14 - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 - adoxq %rdi,%r15 - adcxq %rbp,%r8 - adoxq %rbp,%r8 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rcx - movq 16(%rsi),%rdx - adcxq %rax,%r9 - adcxq %rcx,%r10 - adcxq %rbp,%r11 - - movq %r9,16(%rsp) -.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 - - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 - adoxq %rdi,%r12 - adcxq %r9,%r13 - - mulxq 32(%rsi),%rax,%rcx - adoxq %rax,%r13 - adcxq %rcx,%r14 - - mulxq 40(%rsi),%rdi,%r9 - adoxq %rdi,%r14 - adcxq %r9,%r15 - -.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 - adoxq %rax,%r15 - adcxq %rcx,%r8 - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %rbp,%r9 - adoxq %rbp,%r9 - - movq %r13,%rcx - shldq $1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq %rdx,%r12 - movq 24(%rsi),%rdx - adcxq %rbp,%r13 - - movq %r11,32(%rsp) -.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - - mulxq 40(%rsi),%rdi,%r10 - adoxq %rdi,%r15 - adcxq %r10,%r8 - - mulxq 48(%rsi),%rax,%rbx - adoxq %rax,%r8 - adcxq %rbx,%r9 - - mulxq 56(%rsi),%rdi,%r10 - adoxq %rdi,%r9 - adcxq %rbp,%r10 - adoxq %rbp,%r10 - -.byte 0x66 - movq %r15,%rbx - shldq $1,%r14,%r15 - shldq $1,%rcx,%r14 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r13 - adcxq %rdx,%r14 - movq 32(%rsi),%rdx - adcxq %rbp,%r15 - - movq %r13,48(%rsp) - movq %r14,56(%rsp) - - -.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %r11,%r9 - - mulxq 48(%rsi),%rax,%rcx - adoxq %rax,%r9 - adcxq %rcx,%r10 - - mulxq 56(%rsi),%rdi,%r11 - adoxq %rdi,%r10 - adcxq %rbp,%r11 - adoxq %rbp,%r11 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shldq $1,%rbx,%r8 - - xorl %ebp,%ebp - 
mulxq %rdx,%rax,%rdx - adcxq %rax,%r15 - adcxq %rdx,%r8 - movq 40(%rsi),%rdx - adcxq %rbp,%r9 - - movq %r15,64(%rsp) - movq %r8,72(%rsp) - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %rbp,%r12 - adoxq %rbp,%r12 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r9 - adcxq %rdx,%r10 - movq 48(%rsi),%rdx - adcxq %rbp,%r11 - - movq %r9,80(%rsp) - movq %r10,88(%rsp) - - -.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 - adoxq %rax,%r12 - adoxq %rbp,%r13 - - xorq %r14,%r14 - shldq $1,%r13,%r14 - shldq $1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq %rdx,%r12 - movq 56(%rsi),%rdx - adcxq %rbp,%r13 - -.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 -.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 - - - mulxq %rdx,%rax,%rdx - adoxq %rax,%r13 - adoxq %rbp,%rdx - -.byte 0x66 - addq %rdx,%r14 - - movq %r13,112(%rsp) - movq %r14,120(%rsp) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,%rdx - movq %r9,%rax - movl 128+8(%rsp),%r8d - movq %rdi,%rsi - - decl %r8d - jnz .Loop_sqrx - -.Lsqr_tail: leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -684,10 +410,6 @@ rsaz_512_mul: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) - movl $524544,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je .Lmulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul @@ -705,29 +427,6 @@ rsaz_512_mul: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_tail - -.align 32 -.Lmulx: - movq %rdx,%rbp - movq (%rdx),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex -.Lmul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -762,52 +461,94 @@ rsaz_512_mul_gather4: pushq %r14 pushq %r15 - movl %r9d,%r9d - subq $128+24,%rsp + subq $152,%rsp .Lmul_gather4_body: - movl $524544,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je .Lmulx_gather - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - movl (%rdx,%r9,4),%ebx -.byte 102,72,15,110,201 + movd %r9d,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 + + movdqa 0(%rdx),%xmm8 + movdqa 16(%rdx),%xmm9 + movdqa 32(%rdx),%xmm10 + movdqa 48(%rdx),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rdx),%xmm12 + pand %xmm1,%xmm9 + movdqa 
80(%rdx),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rdx),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rdx),%xmm15 + leaq 128(%rdx),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) - shlq $32,%rax - orq %rax,%rbx movq (%rsi),%rax movq 8(%rsi),%rcx - leaq 128(%rdx,%r9,4),%rbp mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 @@ -820,14 +561,12 @@ rsaz_512_mul_gather4: adcq $0,%r13 mulq %rbx - leaq 128(%rbp),%rbp addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 @@ -839,6 +578,35 @@ rsaz_512_mul_gather4: .align 32 .Loop_mul_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax @@ -847,7 +615,6 @@ rsaz_512_mul_gather4: adcq $0,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx @@ -856,7 +623,6 @@ rsaz_512_mul_gather4: adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx @@ -865,7 +631,6 @@ rsaz_512_mul_gather4: adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -874,7 +639,6 @@ rsaz_512_mul_gather4: adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -899,7 +663,6 @@ rsaz_512_mul_gather4: adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx @@ -907,7 +670,6 @@ rsaz_512_mul_gather4: movq %rdx,%r15 adcq $0,%r15 - leaq 128(%rbp),%rbp leaq 8(%rdi),%rdi decl %ecx @@ -922,8 +684,8 @@ rsaz_512_mul_gather4: movq %r14,48(%rdi) movq %r15,56(%rdi) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -935,126 +697,6 @@ rsaz_512_mul_gather4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_gather_tail - -.align 32 -.Lmulx_gather: - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - leaq 128(%rdx,%r9,4),%rbp - movl (%rdx,%r9,4),%edx -.byte 102,72,15,110,201 - movq %r8,128(%rsp) - - shlq $32,%rax - orq %rax,%rdx - mulxq (%rsi),%rbx,%r8 - movq %rbx,(%rsp) - xorl %edi,%edi - - mulxq 8(%rsi),%rax,%r9 - movd (%rbp),%xmm4 - - mulxq 16(%rsi),%rbx,%r10 - movd 64(%rbp),%xmm5 - adcxq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - pslldq $4,%xmm5 - adcxq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - por %xmm5,%xmm4 - adcxq %rax,%r10 - - mulxq 
40(%rsi),%rax,%r13 - adcxq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - leaq 128(%rbp),%rbp - adcxq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 -.byte 102,72,15,126,226 - adcxq %rbx,%r13 - adcxq %rax,%r14 - movq %r8,%rbx - adcxq %rdi,%r15 - - movq $-7,%rcx - jmp .Loop_mulx_gather - -.align 32 -.Loop_mulx_gather: - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 -.byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - movd 64(%rbp),%xmm5 - leaq 128(%rbp),%rbp - adcxq %rax,%r9 - adoxq %r11,%r10 - -.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 - pslldq $4,%xmm5 - por %xmm5,%xmm4 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 -.byte 102,72,15,126,226 - movq %rbx,64(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - movq %r8,%rbx - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx_gather - - movq %r8,64(%rsp) - movq %r9,64+8(%rsp) - movq %r10,64+16(%rsp) - movq %r11,64+24(%rsp) - movq %r12,64+32(%rsp) - movq %r13,64+40(%rsp) - movq %r14,64+48(%rsp) - movq %r15,64+56(%rsp) - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1092,17 +734,13 @@ rsaz_512_mul_scatter4: movl %r9d,%r9d subq $128+24,%rsp .Lmul_scatter4_body: - leaq (%r8,%r9,4),%r8 + leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 movq %rcx,128(%rsp) movq %rdi,%rbp - movl $524544,%r11d - andl OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je .Lmulx_scatter movq (%rdi),%rbx call __rsaz_512_mul @@ -1119,29 +757,6 @@ rsaz_512_mul_scatter4: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp .Lmul_scatter_tail - -.align 32 -.Lmulx_scatter: - movq (%rdi),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -.Lmul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1155,30 +770,14 @@ rsaz_512_mul_scatter4: call __rsaz_512_subtract - movl %r8d,0(%rsi) - shrq $32,%r8 - movl %r9d,128(%rsi) - shrq $32,%r9 - movl %r10d,256(%rsi) - shrq $32,%r10 - movl %r11d,384(%rsi) - shrq $32,%r11 - movl %r12d,512(%rsi) - shrq $32,%r12 - movl %r13d,640(%rsi) - shrq $32,%r13 - movl %r14d,768(%rsi) - shrq $32,%r14 - movl %r15d,896(%rsi) - shrq $32,%r15 - movl %r8d,64(%rsi) - movl %r9d,192(%rsi) - movl %r10d,320(%rsi) - movl %r11d,448(%rsi) - movl %r12d,576(%rsi) - movl %r13d,704(%rsi) - movl %r14d,832(%rsi) - movl %r15d,960(%rsi) + movq %r8,0(%rsi) + movq %r9,128(%rsi) + movq %r10,256(%rsi) + movq %r11,384(%rsi) + movq %r12,512(%rsi) + movq %r13,640(%rsi) + movq %r14,768(%rsi) + movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -1204,7 +803,6 @@ rsaz_512_mul_by_one: subq $128+24,%rsp .Lmul_by_one_body: - movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) @@ -1225,16 +823,7 @@ rsaz_512_mul_by_one: movdqa 
%xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) - andl $524544,%eax - cmpl $524544,%eax - je .Lby_one_callx call __rsaz_512_reduce - jmp .Lby_one_tail -.align 32 -.Lby_one_callx: - movq 128(%rsp),%rdx - call __rsaz_512_reducex -.Lby_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1338,62 +927,6 @@ __rsaz_512_reduce: .byte 0xf3,0xc3 .size __rsaz_512_reduce,.-__rsaz_512_reduce -.type __rsaz_512_reducex,@function -.align 32 -__rsaz_512_reducex: - - imulq %r8,%rdx - xorq %rsi,%rsi - movl $8,%ecx - jmp .Lreduction_loopx - -.align 32 -.Lreduction_loopx: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 128+8(%rsp),%rbx,%rdx - movq %rax,%rdx - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - - decl %ecx - jne .Lreduction_loopx - - .byte 0xf3,0xc3 -.size __rsaz_512_reducex,.-__rsaz_512_reducex .type __rsaz_512_subtract,@function .align 32 __rsaz_512_subtract: @@ -1593,140 +1126,18 @@ __rsaz_512_mul: .byte 0xf3,0xc3 .size __rsaz_512_mul,.-__rsaz_512_mul -.type __rsaz_512_mulx,@function -.align 32 -__rsaz_512_mulx: - mulxq (%rsi),%rbx,%r8 - movq $-6,%rcx - - mulxq 8(%rsi),%rax,%r9 - movq %rbx,8(%rsp) - - mulxq 16(%rsi),%rbx,%r10 - adcq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - movq 8(%rbp),%rdx - adcq %rbx,%r13 - adcq %rax,%r14 - adcq $0,%r15 - - xorq %rdi,%rdi - jmp .Loop_mulx - -.align 32 -.Loop_mulx: - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rsi),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq 64(%rbp,%rcx,8),%rdx - movq %rbx,8+64-8(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - incq %rcx - jnz .Loop_mulx - - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - -.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 - adcxq %rax,%r8 - adoxq %r10,%r9 - -.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - movq %rbx,8+64-8(%rsp) - movq %r8,8+64(%rsp) - movq %r9,8+64+8(%rsp) - movq %r10,8+64+16(%rsp) - movq %r11,8+64+24(%rsp) - movq %r12,8+64+32(%rsp) - movq 
%r13,8+64+40(%rsp) - movq %r14,8+64+48(%rsp) - movq %r15,8+64+56(%rsp) - - .byte 0xf3,0xc3 -.size __rsaz_512_mulx,.-__rsaz_512_mulx .globl rsaz_512_scatter4 .type rsaz_512_scatter4,@function .align 16 rsaz_512_scatter4: - leaq (%rdi,%rdx,4),%rdi + leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi - movl %eax,(%rdi) - shrq $32,%rax - movl %eax,64(%rdi) + movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz .Loop_scatter @@ -1737,19 +1148,72 @@ rsaz_512_scatter4: .type rsaz_512_gather4,@function .align 16 rsaz_512_gather4: - leaq (%rsi,%rdx,4),%rsi + movd %edx,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp .Loop_gather .align 16 .Loop_gather: - movl (%rsi),%eax - movl 64(%rsi),%r8d + movdqa 0(%rsi),%xmm8 + movdqa 16(%rsi),%xmm9 + movdqa 32(%rsi),%xmm10 + movdqa 48(%rsi),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rsi),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rsi),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rsi),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi - shlq $32,%r8 - orq %r8,%rax - movq %rax,(%rdi) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz .Loop_gather .byte 0xf3,0xc3 +.LSEH_end_rsaz_512_gather4: .size rsaz_512_gather4,.-rsaz_512_gather4 + +.align 64 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s index eed057ad6a1a8a..f4e5337565bbc7 100644 --- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s +++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-gf2m.s @@ -242,7 +242,7 @@ bn_GF2m_mul_2x2: movq %rcx,56(%rsp) movq %r8,64(%rsp) - movq $15,%r8 + movq $0xf,%r8 movq %rsi,%rax movq %rcx,%rbp call _mul_1x1 diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s index 45d19cd8b5fc2e..9e0019c163771c 100644 --- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s +++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont.s @@ -10,7 +10,6 @@ bn_mul_mont: jnz .Lmul_enter cmpl $8,%r9d jb .Lmul_enter - movl OPENSSL_ia32cap_P+8(%rip),%r11d cmpq %rsi,%rdx jne .Lmul4x_enter testl $7,%r9d @@ -216,9 +215,6 @@ bn_mul_mont: .align 16 bn_mul4x_mont: .Lmul4x_enter: - andl $524544,%r11d - cmpl $524544,%r11d - je .Lmulx4x_enter pushq %rbx pushq %rbp pushq %r12 @@ -615,7 +611,6 @@ bn_mul4x_mont: .size bn_mul4x_mont,.-bn_mul4x_mont - .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: @@ -638,20 +633,20 @@ bn_sqr8x_mont: - leaq -64(%rsp,%r9,4),%r11 + leaq -64(%rsp,%r9,2),%r11 movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,4),%rsp + leaq -64(%rsp,%r9,2),%rsp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: - leaq 4096-64(,%r9,4),%r10 - leaq -64(%rsp,%r9,4),%rsp + leaq 4096-64(,%r9,2),%r10 + 
leaq -64(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -661,385 +656,81 @@ bn_sqr8x_mont: movq %r9,%r10 negq %r9 - leaq 64(%rsp,%r9,2),%r11 movq %r8,32(%rsp) movq %rax,40(%rsp) .Lsqr8x_body: - movq %r9,%rbp -.byte 102,73,15,110,211 - shrq $3+2,%rbp - movl OPENSSL_ia32cap_P+8(%rip),%eax - jmp .Lsqr8x_copy_n - -.align 32 -.Lsqr8x_copy_n: - movq 0(%rcx),%xmm0 - movq 8(%rcx),%xmm1 - movq 16(%rcx),%xmm3 - movq 24(%rcx),%xmm4 - leaq 32(%rcx),%rcx - movdqa %xmm0,0(%r11) - movdqa %xmm1,16(%r11) - movdqa %xmm3,32(%r11) - movdqa %xmm4,48(%r11) - leaq 64(%r11),%r11 - decq %rbp - jnz .Lsqr8x_copy_n - +.byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 - andl $524544,%eax - cmpl $524544,%eax - jne .Lsqr8x_nox - - call bn_sqrx8x_internal - - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 - movq 40(%rsp),%rsi - jmp .Lsqr8x_zero - -.align 32 -.Lsqr8x_nox: call bn_sqr8x_internal - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 - movq 40(%rsp),%rsi - jmp .Lsqr8x_zero - -.align 32 -.Lsqr8x_zero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - movdqa %xmm0,32(%rax) - movdqa %xmm0,48(%rax) - leaq 64(%rax),%rax - movdqa %xmm0,0(%rdx) - movdqa %xmm0,16(%rdx) - movdqa %xmm0,32(%rdx) - movdqa %xmm0,48(%rdx) - leaq 64(%rdx),%rdx - decq %r9 - jnz .Lsqr8x_zero - - movq $1,%rax - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -.Lsqr8x_epilogue: - .byte 0xf3,0xc3 -.size bn_sqr8x_mont,.-bn_sqr8x_mont -.type bn_mulx4x_mont,@function -.align 32 -bn_mulx4x_mont: -.Lmulx4x_enter: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - shll $3,%r9d -.byte 0x67 - xorq %r10,%r10 - subq %r9,%r10 - movq (%r8),%r8 - leaq -72(%rsp,%r10,1),%rsp - leaq (%rdx,%r9,1),%r10 - andq $-128,%rsp - - - - - - - - - - movq %r9,0(%rsp) - shrq $5,%r9 - movq %r10,16(%rsp) - subq $1,%r9 - movq %r8,24(%rsp) - movq %rdi,32(%rsp) - movq %rax,40(%rsp) - movq %r9,48(%rsp) - jmp .Lmulx4x_body - -.align 32 -.Lmulx4x_body: - leaq 8(%rdx),%rdi - movq (%rdx),%rdx - leaq 64+32(%rsp),%rbx - movq %rdx,%r9 - - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r14 - addq %rax,%r11 - movq %rdi,8(%rsp) - mulxq 16(%rsi),%r12,%r13 - adcq %r14,%r12 - adcq $0,%r13 - - movq %r8,%rdi - imulq 24(%rsp),%r8 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%rdi - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 -.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 - movq 48(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r12,-16(%rbx) - - jmp .Lmulx4x_1st - -.align 32 -.Lmulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 
24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_1st - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - addq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - jmp .Lmulx4x_outer - -.align 32 -.Lmulx4x_outer: - movq (%rdi),%rdx - leaq 8(%rdi),%rdi - subq %rax,%rsi - movq %r15,(%rbx) - leaq 64+32(%rsp),%rbx - subq %rax,%rcx - - mulxq 0(%rsi),%r8,%r11 - xorl %ebp,%ebp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - adoxq %rbp,%r12 - adcxq %rbp,%r13 - - movq %rdi,8(%rsp) -.byte 0x67 - movq %r8,%r15 - imulq 24(%rsp),%r8 - xorl %ebp,%ebp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - adoxq -16(%rbx),%r12 - adcxq %rax,%r13 - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - adoxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx movq %r9,%rdx - movq %r11,-24(%rbx) - leaq 32(%rcx),%rcx - adcxq %rax,%r12 - adoxq %rbp,%r15 - movq 48(%rsp),%rdi - movq %r12,-16(%rbx) - - jmp .Lmulx4x_inner +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub .align 32 -.Lmulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi +.Lsqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-32(%rbx) - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_inner + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz .Lsqr8x_sub - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - subq 0(%rbx),%rbp - adcq %r15,%r14 - movq -8(%rcx),%r8 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - - cmpq 16(%rsp),%rdi - jne .Lmulx4x_outer - - subq %r14,%r8 - sbbq %r8,%r8 - orq %r8,%r15 - - negq %rax - xorq %rdx,%rdx - movq 32(%rsp),%rdi - leaq 64(%rsp),%rbx + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi +.byte 102,72,15,110,200 pxor %xmm0,%xmm0 - movq 0(%rcx,%rax,1),%r8 - movq 8(%rcx,%rax,1),%r9 - negq %r8 - jmp .Lmulx4x_sub_entry + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + jmp .Lsqr8x_cond_copy .align 32 -.Lmulx4x_sub: - movq 0(%rcx,%rax,1),%r8 - movq 8(%rcx,%rax,1),%r9 - notq %r8 -.Lmulx4x_sub_entry: - movq 16(%rcx,%rax,1),%r10 - notq %r9 - andq %r15,%r8 - movq 24(%rcx,%rax,1),%r11 - notq %r10 - andq %r15,%r9 - notq %r11 - andq %r15,%r10 - andq %r15,%r11 - - negq %rdx - adcq 0(%rbx),%r8 - adcq 8(%rbx),%r9 - movdqa %xmm0,(%rbx) - adcq 16(%rbx),%r10 - adcq 
24(%rbx),%r11 - movdqa %xmm0,16(%rbx) +.Lsqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 leaq 32(%rbx),%rbx - sbbq %rdx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz .Lsqr8x_cond_copy - addq $32,%rax - jnz .Lmulx4x_sub - - movq 40(%rsp),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 @@ -1048,8 +739,8 @@ bn_mulx4x_mont: movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp -.Lmulx4x_epilogue: +.Lsqr8x_epilogue: .byte 0xf3,0xc3 -.size bn_mulx4x_mont,.-bn_mulx4x_mont +.size bn_sqr8x_mont,.-bn_sqr8x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 diff --git a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s index 41e96c8e90e262..8afe2496952006 100644 --- a/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm/x64-elf-gas/bn/x86_64-mont5.s @@ -8,53 +8,157 @@ bn_mul_mont_gather5: testl $7,%r9d jnz .Lmul_enter - movl OPENSSL_ia32cap_P+8(%rip),%r11d jmp .Lmul4x_enter .align 16 .Lmul_enter: movl %r9d,%r9d movq %rsp,%rax - movl 8(%rsp),%r10d + movd 8(%rsp),%xmm5 + leaq .Linc(%rip),%r10 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 + leaq 2(%r9),%r11 negq %r11 - leaq (%rsp,%r11,8),%rsp + leaq -264(%rsp,%r11,8),%rsp andq $-1024,%rsp movq %rax,8(%rsp,%r9,8) .Lmul_body: - movq %rdx,%r12 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa 
%xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 leaq 256(%r12),%r12 - por %xmm3,%xmm0 - .byte 102,72,15,126,195 movq (%r8),%r8 @@ -63,29 +167,14 @@ bn_mul_mont_gather5: xorq %r14,%r14 xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -118,14 +207,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .L1st -.byte 102,72,15,126,195 addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 @@ -139,33 +226,78 @@ bn_mul_mont_gather5: jmp .Louter .align 16 .Louter: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax +.byte 102,72,15,126,195 + xorq %r15,%r15 movq %r8,%rbp movq 
(%rsp),%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -201,15 +333,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .Linner -.byte 102,72,15,126,195 - addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 - movq (%rsp,%r15,8),%r10 + movq (%rsp,%r9,8),%r10 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx @@ -256,6 +385,7 @@ bn_mul_mont_gather5: movq 8(%rsp,%r9,8),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -270,9 +400,6 @@ bn_mul_mont_gather5: .align 32 bn_mul4x_mont_gather5: .Lmul4x_enter: - andl $524544,%r11d - cmpl $524544,%r11d - je .Lmulx4x_enter .byte 0x67 movq %rsp,%rax pushq %rbx @@ -281,10 +408,10 @@ bn_mul4x_mont_gather5: pushq %r13 pushq %r14 pushq %r15 + .byte 0x67 - movl %r9d,%r10d shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 @@ -294,19 +421,21 @@ bn_mul4x_mont_gather5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -322,6 +451,7 @@ bn_mul4x_mont_gather5: movq 40(%rsp),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -337,47 +467,141 @@ bn_mul4x_mont_gather5: .align 32 mul4x_internal: shlq $5,%r9 - movl 8(%rax),%r10d - leaq 256(%rdx,%r9,1),%r13 + movd 8(%rax),%xmm5 + leaq .Linc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 - - movq -96(%r12),%xmm0 - leaq 256(%r12),%r14 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 -.byte 0x67 - por %xmm1,%xmm0 - movq -96(%r14),%xmm1 -.byte 0x67 - pand %xmm7,%xmm3 -.byte 0x67 - por %xmm2,%xmm0 - movq -32(%r14),%xmm2 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + 
movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq 32(%r14),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 .byte 102,72,15,126,195 - movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) @@ -391,26 +615,10 @@ mul4x_internal: movq %rax,%r10 movq (%rcx),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq %r10,%rbp - - - - - - - - leaq 64+8(%rsp,%r11,8),%r14 + leaq 64+8(%rsp),%r14 movq %rdx,%r11 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - leaq 512(%r12),%r12 - por %xmm1,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax @@ -419,7 +627,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -429,7 +637,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -439,7 +647,7 @@ mul4x_internal: .L1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -455,7 +663,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -485,7 +693,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -494,7 +702,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -504,7 +712,7 @@ mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -520,7 +728,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -533,8 +741,7 @@ mul4x_internal: movq %rdi,-16(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -545,6 +752,63 @@ mul4x_internal: .align 32 .Louter4x: + leaq 16+128(%r14),%rdx + pxor 
%xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx @@ -552,25 +816,11 @@ mul4x_internal: movq (%rcx),%rax adcq $0,%rdx - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - imulq %r10,%rbp -.byte 0x67 movq %rdx,%r11 movq %rdi,(%r14) - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 leaq (%r14,%r9,1),%r14 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 @@ -580,7 +830,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -592,7 +842,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp .Linner4x @@ -601,7 +851,7 @@ mul4x_internal: .Linner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -619,7 +869,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -653,7 +903,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -664,7 +914,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 @@ -674,7 +924,7 @@ mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -693,7 +943,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 movq %rbp,%rax - movq -16(%rcx),%rbp + movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -708,9 +958,8 @@ mul4x_internal: movq %r13,-24(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 movq %rdi,-16(%r14) - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -721,25 +970,28 @@ mul4x_internal: cmpq 16+8(%rsp),%r12 jb .Louter4x + xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi - xorq $1,%rdi + subq %rdi,%rax leaq (%r14,%r9,1),%rbx - leaq (%rcx,%rdi,8),%rbp + movq (%rcx),%r12 + leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi - jmp .Lsqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp 
.Lsqr4x_sub_entry .size mul4x_internal,.-mul4x_internal .globl bn_power5 .type bn_power5,@function .align 32 bn_power5: - movl OPENSSL_ia32cap_P+8(%rip),%r11d - andl $524544,%r11d - cmpl $524544,%r11d - je .Lpowerx5_enter movq %rsp,%rax pushq %rbx pushq %rbp @@ -747,9 +999,9 @@ bn_power5: pushq %r13 pushq %r14 pushq %r15 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 @@ -759,19 +1011,20 @@ bn_power5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -799,10 +1052,15 @@ bn_power5: .byte 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 @@ -1346,9 +1604,9 @@ __bn_sqr8x_internal: movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xorq %rax,%rax - leaq (%rbp,%r9,2),%rcx + leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi @@ -1381,14 +1639,14 @@ sqr8x_reduction: .align 32 .L8x_reduce: mulq %rbx - movq 16(%rbp),%rax + movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) @@ -1397,7 +1655,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi @@ -1406,7 +1664,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 @@ -1415,7 +1673,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1423,7 +1681,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1431,7 +1689,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1449,7 +1707,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_reduce - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp @@ -1475,14 +1733,14 @@ sqr8x_reduction: .L8x_tail: mulq %rbx addq %rax,%r8 - movq 16(%rbp),%rax + movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi @@ -1491,7 +1749,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 @@ -1499,7 +1757,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 @@ -1507,7 +1765,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1515,7 +1773,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq 
%r13,%r12 movq %rdx,%r13 @@ -1523,7 +1781,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1541,7 +1799,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_tail - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_tail_done @@ -1587,7 +1845,7 @@ sqr8x_reduction: adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax - movq -16(%rbp),%rcx + movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 @@ -1605,44 +1863,62 @@ sqr8x_reduction: cmpq %rdx,%rdi jb .L8x_reduction_loop - - subq %r15,%rcx + .byte 0xf3,0xc3 +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.type __bn_post4x_internal,@function +.align 32 +__bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx - adcq %rsi,%rsi movq %r9,%rcx - orq %rsi,%rax .byte 102,72,15,126,207 - xorq $1,%rax + negq %rax .byte 102,72,15,126,206 - leaq (%rbp,%rax,8),%rbp sarq $3+2,%rcx - jmp .Lsqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry -.align 32 +.align 16 .Lsqr4x_sub: -.byte 0x66 - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rbx),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 32(%rbp),%r14 + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 movq %r12,0(%rdi) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp + leaq 32(%rbx),%rbx movq %r13,8(%rdi) + sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr4x_sub + movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 -.size bn_sqr8x_internal,.-bn_sqr8x_internal +.size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .type bn_from_montgomery,@function .align 32 @@ -1664,10 +1940,9 @@ bn_from_mont8x: pushq %r13 pushq %r14 pushq %r15 -.byte 0x67 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 @@ -1677,19 +1952,20 @@ bn_from_mont8x: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lfrom_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -1740,22 +2016,8 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 - movl OPENSSL_ia32cap_P+8(%rip),%r11d - andl $524544,%r11d - cmpl $524544,%r11d - jne .Lfrom_mont_nox - - leaq (%rax,%r9,1),%rdi - call sqrx8x_reduction - - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - movq 40(%rsp),%rsi - jmp .Lfrom_mont_zero - -.align 32 -.Lfrom_mont_nox: - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 leaq 48(%rsp),%rax @@ -1783,1197 +2045,208 @@ bn_from_mont8x: .Lfrom_epilogue: .byte 0xf3,0xc3 .size bn_from_mont8x,.-bn_from_mont8x -.type bn_mulx4x_mont_gather5,@function -.align 32 -bn_mulx4x_mont_gather5: -.Lmulx4x_enter: -.byte 0x67 - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 -.byte 0x67 - movl %r9d,%r10d - shll $3,%r9d - shll $3+2,%r10d - negq %r9 - movq 
(%r8),%r8 - - - - - - - - - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lmulx4xsp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp - jmp .Lmulx4xsp_done - -.align 32 -.Lmulx4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rsp -.Lmulx4xsp_done: - andq $-64,%rsp - - - - - - - - - - - - - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.Lmulx4x_body: - call mulx4x_internal +.globl bn_get_bits5 +.type bn_get_bits5,@function +.align 16 +bn_get_bits5: + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 + movl %esi,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax + shrl %cl,%eax + andl $31,%eax + .byte 0xf3,0xc3 +.size bn_get_bits5,.-bn_get_bits5 - movq 40(%rsp),%rsi - movq $1,%rax - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -.Lmulx4x_epilogue: +.globl bn_scatter5 +.type bn_scatter5,@function +.align 16 +bn_scatter5: + cmpl $0,%esi + jz .Lscatter_epilogue + leaq (%rdx,%rcx,8),%rdx +.Lscatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz .Lscatter +.Lscatter_epilogue: .byte 0xf3,0xc3 -.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 +.size bn_scatter5,.-bn_scatter5 -.type mulx4x_internal,@function +.globl bn_gather5 +.type bn_gather5,@function .align 32 -mulx4x_internal: -.byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 -.byte 0x67 - negq %r9 - shlq $5,%r9 - leaq 256(%rdx,%r9,1),%r13 - shrq $5+5,%r9 - movl 8(%rax),%r10d - subq $1,%r9 - movq %r13,16+8(%rsp) - movq %r9,24+8(%rsp) - movq %rdi,56+8(%rsp) - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%rdi - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 - - movq -96(%rdi),%xmm0 - leaq 256(%rdi),%rbx - movq -32(%rdi),%xmm1 - pand %xmm4,%xmm0 - movq 32(%rdi),%xmm2 - pand %xmm5,%xmm1 - movq 96(%rdi),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - movq -96(%rbx),%xmm1 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 - movq -32(%rbx),%xmm2 - por %xmm3,%xmm0 -.byte 0x67,0x67 - pand %xmm4,%xmm1 - movq 32(%rbx),%xmm3 - -.byte 102,72,15,126,194 - movq 96(%rbx),%xmm0 - leaq 512(%rdi),%rdi - pand %xmm5,%xmm2 -.byte 0x67,0x67 - pand %xmm6,%xmm3 - - - +bn_gather5: +.LSEH_begin_bn_gather5: + +.byte 0x4c,0x8d,0x14,0x24 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq .Linc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + 
paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp .Lgather +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz .Lgather - - - leaq 64+32+8(%rsp,%r11,8),%rbx - - movq %rdx,%r9 - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r12 - addq %rax,%r11 - mulxq 16(%rsi),%rax,%r13 - adcq %rax,%r12 - adcq $0,%r13 - mulxq 24(%rsi),%rax,%r14 - - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - xorq %rbp,%rbp - movq %r8,%rdx - - por %xmm2,%xmm1 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - movq %rdi,8+8(%rsp) - por %xmm1,%xmm0 - -.byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 16(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 32(%rcx),%rax,%r12 - movq 24+8(%rsp),%rdi -.byte 0x66 - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 48(%rcx),%rax,%r15 -.byte 0x67,0x67 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 -.byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 - movq %r12,-16(%rbx) - - -.align 32 -.Lmulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 16(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 32(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 48(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - 
adoxq %rbp,%r15 - leaq 64(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_1st - - movq 8(%rsp),%rax -.byte 102,72,15,126,194 - adcq %rbp,%r15 - leaq (%rsi,%rax,1),%rsi - addq %r15,%r14 - movq 8+8(%rsp),%rdi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - jmp .Lmulx4x_outer - -.align 32 -.Lmulx4x_outer: - movq %rbp,(%rbx) - leaq 32(%rbx,%rax,1),%rbx - mulxq 0(%rsi),%r8,%r11 - xorq %rbp,%rbp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - mulxq 24(%rsi),%rdx,%r14 - adoxq -16(%rbx),%r12 - adcxq %rdx,%r13 - leaq (%rcx,%rax,2),%rcx - leaq 32(%rsi),%rsi - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - adoxq %rbp,%r14 - -.byte 0x67 - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - - movq -96(%rdi),%xmm0 -.byte 0x67,0x67 - movq %r8,%rdx - movq -32(%rdi),%xmm1 -.byte 0x67 - pand %xmm4,%xmm0 - movq 32(%rdi),%xmm2 -.byte 0x67 - pand %xmm5,%xmm1 - movq 96(%rdi),%xmm3 - addq $256,%rdi -.byte 0x67 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - xorq %rbp,%rbp - movq %rdi,8+8(%rsp) - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 16(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 32(%rcx),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 48(%rcx),%rax,%r15 - movq %r9,%rdx - por %xmm2,%xmm0 - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - por %xmm3,%xmm0 - adcxq %rax,%r12 - movq %r11,-24(%rbx) - adoxq %rbp,%r15 - movq %r12,-16(%rbx) - leaq 64(%rcx),%rcx - jmp .Lmulx4x_inner - -.align 32 -.Lmulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 16(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 32(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - movq %r11,-32(%rbx) - mulxq 48(%rcx),%rax,%r15 - movq %r9,%rdx - leaq 64(%rcx),%rcx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_inner - - movq 0+8(%rsp),%rax -.byte 102,72,15,126,194 - adcq %rbp,%r15 - subq 0(%rbx),%rdi - movq 8+8(%rsp),%rdi - movq 16+8(%rsp),%r10 - adcq %r15,%r14 - leaq (%rsi,%rax,1),%rsi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - - cmpq %r10,%rdi - jb .Lmulx4x_outer - - movq -16(%rcx),%r10 - xorq %r15,%r15 - subq %r14,%r10 - adcq %r15,%r15 - orq %r15,%rbp - xorq $1,%rbp - leaq (%rbx,%rax,1),%rdi - leaq (%rcx,%rax,2),%rcx -.byte 0x67,0x67 - sarq $3+2,%rax - leaq (%rcx,%rbp,8),%rbp - movq 56+8(%rsp),%rdx - movq %rax,%rcx - jmp .Lsqrx4x_sub -.size mulx4x_internal,.-mulx4x_internal -.type bn_powerx5,@function -.align 32 -bn_powerx5: -.Lpowerx5_enter: -.byte 0x67 - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 -.byte 0x67 - movl %r9d,%r10d - shll $3,%r9d - shll $3+2,%r10d - negq %r9 - movq (%r8),%r8 - - - - - - - - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lpwrx_sp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp - jmp .Lpwrx_sp_done - -.align 32 -.Lpwrx_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rsp -.Lpwrx_sp_done: - andq 
$-64,%rsp - movq %r9,%r10 - negq %r9 - - - - - - - - - - - - - pxor %xmm0,%xmm0 -.byte 102,72,15,110,207 -.byte 102,72,15,110,209 -.byte 102,73,15,110,218 -.byte 102,72,15,110,226 - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.Lpowerx5_body: - - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - - movq %r10,%r9 - movq %rsi,%rdi -.byte 102,72,15,126,209 -.byte 102,72,15,126,226 - movq 40(%rsp),%rax - - call mulx4x_internal - - movq 40(%rsp),%rsi - movq $1,%rax - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -.Lpowerx5_epilogue: - .byte 0xf3,0xc3 -.size bn_powerx5,.-bn_powerx5 - -.globl bn_sqrx8x_internal -.hidden bn_sqrx8x_internal -.type bn_sqrx8x_internal,@function -.align 32 -bn_sqrx8x_internal: -__bn_sqrx8x_internal: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - leaq 48+8(%rsp),%rdi - leaq (%rsi,%r9,1),%rbp - movq %r9,0+8(%rsp) - movq %rbp,8+8(%rsp) - jmp .Lsqr8x_zero_start - -.align 32 -.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 -.Lsqrx8x_zero: -.byte 0x3e - movdqa %xmm0,0(%rdi) - movdqa %xmm0,16(%rdi) - movdqa %xmm0,32(%rdi) - movdqa %xmm0,48(%rdi) -.Lsqr8x_zero_start: - movdqa %xmm0,64(%rdi) - movdqa %xmm0,80(%rdi) - movdqa %xmm0,96(%rdi) - movdqa %xmm0,112(%rdi) - leaq 128(%rdi),%rdi - subq $64,%r9 - jnz .Lsqrx8x_zero - - movq 0(%rsi),%rdx - - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - leaq 48+8(%rsp),%rdi - xorq %rbp,%rbp - jmp .Lsqrx8x_outer_loop - -.align 32 -.Lsqrx8x_outer_loop: - mulxq 8(%rsi),%r8,%rax - adcxq %r9,%r8 - adoxq %rax,%r10 - mulxq 16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 -.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 - adcxq %r11,%r10 - adoxq %rax,%r12 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 - adcxq %r12,%r11 - adoxq %rax,%r13 - mulxq 40(%rsi),%r12,%rax - adcxq %r13,%r12 - adoxq %rax,%r14 - mulxq 48(%rsi),%r13,%rax - adcxq %r14,%r13 - adoxq %r15,%rax - mulxq 56(%rsi),%r14,%r15 - movq 8(%rsi),%rdx - adcxq %rax,%r14 - adoxq %rbp,%r15 - adcq 64(%rdi),%r15 - movq %r8,8(%rdi) - movq %r9,16(%rdi) - sbbq %rcx,%rcx - xorq %rbp,%rbp - - - mulxq 16(%rsi),%r8,%rbx - mulxq 24(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 32(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %rbx,%r11 -.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 - adcxq %r13,%r11 - adoxq %r14,%r12 -.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 - movq 16(%rsi),%rdx - adcxq %rax,%r12 - adoxq %rbx,%r13 - adcxq %r15,%r13 - adoxq %rbp,%r14 - adcxq %rbp,%r14 - - movq %r8,24(%rdi) - movq %r9,32(%rdi) - - mulxq 24(%rsi),%r8,%rbx - mulxq 32(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 40(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %r13,%r11 -.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 -.byte 0x3e - movq 24(%rsi),%rdx - adcxq %rbx,%r11 - adoxq %rax,%r12 - adcxq %r14,%r12 - movq %r8,40(%rdi) - movq %r9,48(%rdi) - mulxq 32(%rsi),%r8,%rax - adoxq %rbp,%r13 - adcxq %rbp,%r13 - - mulxq 40(%rsi),%r9,%rbx - adcxq %r10,%r8 - adoxq %rax,%r9 - mulxq 48(%rsi),%r10,%rax - adcxq %r11,%r9 - adoxq %r12,%r10 - mulxq 56(%rsi),%r11,%r12 - movq 32(%rsi),%rdx - movq 40(%rsi),%r14 - adcxq %rbx,%r10 - adoxq 
%rax,%r11 - movq 48(%rsi),%r15 - adcxq %r13,%r11 - adoxq %rbp,%r12 - adcxq %rbp,%r12 - - movq %r8,56(%rdi) - movq %r9,64(%rdi) - - mulxq %r14,%r9,%rax - movq 56(%rsi),%r8 - adcxq %r10,%r9 - mulxq %r15,%r10,%rbx - adoxq %rax,%r10 - adcxq %r11,%r10 - mulxq %r8,%r11,%rax - movq %r14,%rdx - adoxq %rbx,%r11 - adcxq %r12,%r11 - - adcxq %rbp,%rax - - mulxq %r15,%r14,%rbx - mulxq %r8,%r12,%r13 - movq %r15,%rdx - leaq 64(%rsi),%rsi - adcxq %r14,%r11 - adoxq %rbx,%r12 - adcxq %rax,%r12 - adoxq %rbp,%r13 - -.byte 0x67,0x67 - mulxq %r8,%r8,%r14 - adcxq %r8,%r13 - adcxq %rbp,%r14 - - cmpq 8+8(%rsp),%rsi - je .Lsqrx8x_outer_break - - negq %rcx - movq $-8,%rcx - movq %rbp,%r15 - movq 64(%rdi),%r8 - adcxq 72(%rdi),%r9 - adcxq 80(%rdi),%r10 - adcxq 88(%rdi),%r11 - adcq 96(%rdi),%r12 - adcq 104(%rdi),%r13 - adcq 112(%rdi),%r14 - adcq 120(%rdi),%r15 - leaq (%rsi),%rbp - leaq 128(%rdi),%rdi - sbbq %rax,%rax - - movq -64(%rsi),%rdx - movq %rax,16+8(%rsp) - movq %rdi,24+8(%rsp) - - - xorl %eax,%eax - jmp .Lsqrx8x_loop - -.align 32 -.Lsqrx8x_loop: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - movq %rbx,(%rdi,%rcx,8) - movl $0,%ebx - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 - movq 8(%rsi,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rbx,%r15 - adcxq %rbx,%r15 - -.byte 0x67 - incq %rcx - jnz .Lsqrx8x_loop - - leaq 64(%rbp),%rbp - movq $-8,%rcx - cmpq 8+8(%rsp),%rbp - je .Lsqrx8x_break - - subq 16+8(%rsp),%rbx -.byte 0x66 - movq -64(%rsi),%rdx - adcxq 0(%rdi),%r8 - adcxq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi -.byte 0x67 - sbbq %rax,%rax - xorl %ebx,%ebx - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_loop - -.align 32 -.Lsqrx8x_break: - subq 16+8(%rsp),%r8 - movq 24+8(%rsp),%rcx - movq 0(%rsi),%rdx - xorl %ebp,%ebp - movq %r8,0(%rdi) - cmpq %rcx,%rdi - je .Lsqrx8x_outer_loop - - movq %r9,8(%rdi) - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - movq 40(%rcx),%r13 - movq %r14,48(%rdi) - movq 48(%rcx),%r14 - movq %r15,56(%rdi) - movq 56(%rcx),%r15 - movq %rcx,%rdi - jmp .Lsqrx8x_outer_loop - -.align 32 -.Lsqrx8x_outer_break: - movq %r9,72(%rdi) -.byte 102,72,15,126,217 - movq %r10,80(%rdi) - movq %r11,88(%rdi) - movq %r12,96(%rdi) - movq %r13,104(%rdi) - movq %r14,112(%rdi) - leaq 48+8(%rsp),%rdi - movq (%rsi,%rcx,1),%rdx - - movq 8(%rdi),%r11 - xorq %r10,%r10 - movq 0+8(%rsp),%r9 - adoxq %r11,%r11 - movq 16(%rdi),%r12 - movq 24(%rdi),%r13 - - -.align 32 -.Lsqrx4x_shift_n_add: - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax -.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 -.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 40(%rdi),%r11 - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - movq 16(%rsi,%rcx,1),%rdx - movq 48(%rdi),%r12 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 56(%rdi),%r13 - movq %rax,16(%rdi) - movq %rbx,24(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq 
%r12,%r12 - adcxq %r10,%rax - movq 24(%rsi,%rcx,1),%rdx - leaq 32(%rcx),%rcx - movq 64(%rdi),%r10 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 72(%rdi),%r11 - movq %rax,32(%rdi) - movq %rbx,40(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - jrcxz .Lsqrx4x_shift_n_add_break -.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 80(%rdi),%r12 - movq 88(%rdi),%r13 - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi - nop - jmp .Lsqrx4x_shift_n_add - -.align 32 -.Lsqrx4x_shift_n_add_break: - adcxq %r13,%rbx - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi -.byte 102,72,15,126,213 -sqrx8x_reduction: - xorl %eax,%eax - movq 32+8(%rsp),%rbx - movq 48+8(%rsp),%rdx - leaq -128(%rbp,%r9,2),%rcx - - movq %rcx,0+8(%rsp) - movq %rdi,8+8(%rsp) - - leaq 48+8(%rsp),%rdi - jmp .Lsqrx8x_reduction_loop - -.align 32 -.Lsqrx8x_reduction_loop: - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq %rdx,%r8 - imulq %rbx,%rdx - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq %rax,24+8(%rsp) - - leaq 64(%rdi),%rdi - xorq %rsi,%rsi - movq $-8,%rcx - jmp .Lsqrx8x_reduce - -.align 32 -.Lsqrx8x_reduce: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 16(%rbp),%rbx,%r9 - adcxq %rbx,%r8 - adoxq %r10,%r9 - - mulxq 32(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 48(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 32+8(%rsp),%rbx,%rdx - movq %rax,%rdx - movq %rax,64+48+8(%rsp,%rcx,8) - - mulxq 80(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 96(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 112(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - -.byte 0x67,0x67,0x67 - incq %rcx - jnz .Lsqrx8x_reduce - - movq %rsi,%rax - cmpq 0+8(%rsp),%rbp - jae .Lsqrx8x_no_tail - - movq 48+8(%rsp),%rdx - addq 0(%rdi),%r8 - leaq 128(%rbp),%rbp - movq $-8,%rcx - adcxq 8(%rdi),%r9 - adcxq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_tail - -.align 32 -.Lsqrx8x_tail: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 16(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 32(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 48(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 80(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 96(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 112(%rbp),%rax,%r15 - movq 72+48+8(%rsp,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - movq %rbx,(%rdi,%rcx,8) - movq %r8,%rbx - adcxq %rsi,%r15 - - incq %rcx - jnz .Lsqrx8x_tail - - cmpq 0+8(%rsp),%rbp - jae .Lsqrx8x_tail_done - - subq 16+8(%rsp),%rsi - movq 48+8(%rsp),%rdx - leaq 128(%rbp),%rbp - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - subq $8,%rcx - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_tail - -.align 32 -.Lsqrx8x_tail_done: - addq 24+8(%rsp),%r8 - adcq $0,%r9 - 
adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - - movq %rsi,%rax - - subq 16+8(%rsp),%rsi -.Lsqrx8x_no_tail: - adcq 0(%rdi),%r8 -.byte 102,72,15,126,217 - adcq 8(%rdi),%r9 - movq 112(%rbp),%rsi -.byte 102,72,15,126,213 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - adcq %rax,%rax - - movq 32+8(%rsp),%rbx - movq 64(%rdi,%rcx,1),%rdx - - movq %r8,0(%rdi) - leaq 64(%rdi),%r8 - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 64(%rdi,%rcx,1),%rdi - cmpq 8+8(%rsp),%r8 - jb .Lsqrx8x_reduction_loop - xorl %ebx,%ebx - subq %r15,%rsi - adcq %rbx,%rbx - movq %rcx,%r10 - orq %rbx,%rax - movq %rcx,%r9 - xorq $1,%rax - sarq $3+2,%rcx - - leaq (%rbp,%rax,8),%rbp -.byte 102,72,15,126,202 -.byte 102,72,15,126,206 - jmp .Lsqrx4x_sub - -.align 32 -.Lsqrx4x_sub: -.byte 0x66 - movq 0(%rdi),%r12 - movq 8(%rdi),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rdi),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rdi),%r15 - leaq 32(%rdi),%rdi - sbbq 32(%rbp),%r14 - movq %r12,0(%rdx) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp - movq %r13,8(%rdx) - movq %r14,16(%rdx) - movq %r15,24(%rdx) - leaq 32(%rdx),%rdx - - incq %rcx - jnz .Lsqrx4x_sub - negq %r9 - - .byte 0xf3,0xc3 -.size bn_sqrx8x_internal,.-bn_sqrx8x_internal -.globl bn_get_bits5 -.type bn_get_bits5,@function -.align 16 -bn_get_bits5: - leaq 0(%rdi),%r10 - leaq 1(%rdi),%r11 - movl %esi,%ecx - shrl $4,%esi - andl $15,%ecx - leal -8(%rcx),%eax - cmpl $11,%ecx - cmovaq %r11,%r10 - cmoval %eax,%ecx - movzwl (%r10,%rsi,2),%eax - shrl %cl,%eax - andl $31,%eax - .byte 0xf3,0xc3 -.size bn_get_bits5,.-bn_get_bits5 - -.globl bn_scatter5 -.type bn_scatter5,@function -.align 16 -bn_scatter5: - cmpl $0,%esi - jz .Lscatter_epilogue - leaq (%rdx,%rcx,8),%rdx -.Lscatter: - movq (%rdi),%rax - leaq 8(%rdi),%rdi - movq %rax,(%rdx) - leaq 256(%rdx),%rdx - subl $1,%esi - jnz .Lscatter -.Lscatter_epilogue: - .byte 0xf3,0xc3 -.size bn_scatter5,.-bn_scatter5 - -.globl bn_gather5 -.type bn_gather5,@function -.align 16 -bn_gather5: - movl %ecx,%r11d - shrl $3,%ecx - andq $7,%r11 - notl %ecx - leaq .Lmagic_masks(%rip),%rax - andl $3,%ecx - leaq 128(%rdx,%r11,8),%rdx - movq 0(%rax,%rcx,8),%xmm4 - movq 8(%rax,%rcx,8),%xmm5 - movq 16(%rax,%rcx,8),%xmm6 - movq 24(%rax,%rcx,8),%xmm7 - jmp .Lgather -.align 16 -.Lgather: - movq -128(%rdx),%xmm0 - movq -64(%rdx),%xmm1 - pand %xmm4,%xmm0 - movq 0(%rdx),%xmm2 - pand %xmm5,%xmm1 - movq 64(%rdx),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 -.byte 0x67,0x67 - por %xmm2,%xmm0 - leaq 256(%rdx),%rdx - por %xmm3,%xmm0 - - movq %xmm0,(%rdi) - leaq 8(%rdi),%rdi - subl $1,%esi - jnz .Lgather + leaq (%r10),%rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 .align 64 -.Lmagic_masks: -.long 0,0, 0,0, 0,0, -1,-1 -.long 0,0, 0,0, 0,0, 0,0 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s index 3a999664c74dc6..7876e382994517 100644 --- 
a/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/ec/ecp_nistz256-x86_64.s @@ -332,8 +332,6 @@ ecp_nistz256_neg: .type ecp_nistz256_to_mont,@function .align 32 ecp_nistz256_to_mont: - movl $524544,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx leaq .LRR(%rip),%rdx jmp .Lmul_mont .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont @@ -348,8 +346,6 @@ ecp_nistz256_to_mont: .type ecp_nistz256_mul_mont,@function .align 32 ecp_nistz256_mul_mont: - movl $524544,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx .Lmul_mont: pushq %rbp pushq %rbx @@ -357,8 +353,6 @@ ecp_nistz256_mul_mont: pushq %r13 pushq %r14 pushq %r15 - cmpl $524544,%ecx - je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -367,19 +361,6 @@ ecp_nistz256_mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp .Lmul_mont_done - -.align 32 -.Lmul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx .Lmul_mont_done: popq %r15 popq %r14 @@ -617,33 +598,18 @@ __ecp_nistz256_mul_montq: .type ecp_nistz256_sqr_mont,@function .align 32 ecp_nistz256_sqr_mont: - movl $524544,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - cmpl $524544,%ecx - je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp .Lsqr_mont_done - -.align 32 -.Lsqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_sqr_montx .Lsqr_mont_done: popq %r15 popq %r14 @@ -815,304 +781,6 @@ __ecp_nistz256_sqr_montq: .byte 0xf3,0xc3 .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq -.type __ecp_nistz256_mul_montx,@function -.align 32 -__ecp_nistz256_mul_montx: - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - movq $32,%r14 - xorq %r13,%r13 - mulxq %r11,%rbp,%r11 - movq .Lpoly+24(%rip),%r15 - adcq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - adcq %rbp,%r10 - shlxq %r14,%r8,%rbp - adcq %rcx,%r11 - shrxq %r14,%r8,%rcx - adcq $0,%r12 - - - - addq %rbp,%r9 - adcq %rcx,%r10 - - mulxq %r15,%rcx,%rbp - movq 8(%rbx),%rdx - adcq %rcx,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - xorq %r8,%r8 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - adcxq %rcx,%r12 - shlxq %r14,%r9,%rcx - adoxq %rbp,%r13 - shrxq %r14,%r9,%rbp - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - - addq %rcx,%r10 - adcq %rbp,%r11 - - mulxq %r15,%rcx,%rbp - movq 16(%rbx),%rdx - adcq %rcx,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - adcxq %rcx,%r13 - shlxq %r14,%r10,%rcx - adoxq %rbp,%r8 - shrxq %r14,%r10,%rbp - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - - addq %rcx,%r11 - adcq %rbp,%r12 - - mulxq %r15,%rcx,%rbp - movq 24(%rbx),%rdx - adcq %rcx,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 
16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - adcxq %rcx,%r8 - shlxq %r14,%r11,%rcx - adoxq %rbp,%r9 - shrxq %r14,%r11,%rbp - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - - addq %rcx,%r12 - adcq %rbp,%r13 - - mulxq %r15,%rcx,%rbp - movq %r12,%rbx - movq .Lpoly+8(%rip),%r14 - adcq %rcx,%r8 - movq %r13,%rdx - adcq %rbp,%r9 - adcq $0,%r10 - - - - xorl %eax,%eax - movq %r8,%rcx - sbbq $-1,%r12 - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rbp - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %rbp,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx - -.type __ecp_nistz256_sqr_montx,@function -.align 32 -__ecp_nistz256_sqr_montx: - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - xorl %eax,%eax - adcq %rcx,%r10 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - - mulxq %r8,%rcx,%r14 - movq 0+128(%rsi),%rdx - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - mulxq %rdx,%r8,%rbp - movq 8+128(%rsi),%rdx - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax - movq 16+128(%rsi),%rdx - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 -.byte 0x67 - mulxq %rdx,%rcx,%rbp - movq 24+128(%rsi),%rdx - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - movq $32,%rsi - adoxq %rbp,%r13 -.byte 0x67,0x67 - mulxq %rdx,%rcx,%rax - movq %r8,%rdx - adoxq %rcx,%r14 - shlxq %rsi,%r8,%rcx - adoxq %rax,%r15 - shrxq %rsi,%r8,%rax - movq .Lpoly+24(%rip),%rbp - - - addq %rcx,%r9 - adcq %rax,%r10 - - mulxq %rbp,%rcx,%r8 - movq %r9,%rdx - adcq %rcx,%r11 - shlxq %rsi,%r9,%rcx - adcq $0,%r8 - shrxq %rsi,%r9,%rax - - - addq %rcx,%r10 - adcq %rax,%r11 - - mulxq %rbp,%rcx,%r9 - movq %r10,%rdx - adcq %rcx,%r8 - shlxq %rsi,%r10,%rcx - adcq $0,%r9 - shrxq %rsi,%r10,%rax - - - addq %rcx,%r11 - adcq %rax,%r8 - - mulxq %rbp,%rcx,%r10 - movq %r11,%rdx - adcq %rcx,%r9 - shlxq %rsi,%r11,%rcx - adcq $0,%r10 - shrxq %rsi,%r11,%rax - - - addq %rcx,%r8 - adcq %rax,%r9 - - mulxq %rbp,%rcx,%r11 - adcq %rcx,%r10 - adcq $0,%r11 - - xorq %rdx,%rdx - adcq %r8,%r12 - movq .Lpoly+8(%rip),%rsi - adcq %r9,%r13 - movq %r12,%r8 - adcq %r10,%r14 - adcq %r11,%r15 - movq %r13,%r9 - adcq $0,%rdx - - xorl %eax,%eax - sbbq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%r11 - sbbq %rbp,%r15 - sbbq $0,%rdx - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - cmovcq %r11,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 -.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx @@ -1215,9 +883,6 @@ ecp_nistz256_from_mont: .type ecp_nistz256_select_w5,@function .align 32 ecp_nistz256_select_w5: - movl OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz .Lavx2_select_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 @@ -1277,9 +942,6 @@ ecp_nistz256_select_w5: .type ecp_nistz256_select_w7,@function .align 32 ecp_nistz256_select_w7: - movl OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz .Lavx2_select_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 @@ -1321,141 +983,11 @@ ecp_nistz256_select_w7: movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 - - -.type 
ecp_nistz256_avx2_select_w5,@function -.align 32 -ecp_nistz256_avx2_select_w5: -.Lavx2_select_w5: - vzeroupper - vmovdqa .LTwo(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - vpxor %ymm4,%ymm4,%ymm4 - - vmovdqa .LOne(%rip),%ymm5 - vmovdqa .LTwo(%rip),%ymm10 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - movq $8,%rax -.Lselect_loop_avx2_w5: - - vmovdqa 0(%rsi),%ymm6 - vmovdqa 32(%rsi),%ymm7 - vmovdqa 64(%rsi),%ymm8 - - vmovdqa 96(%rsi),%ymm11 - vmovdqa 128(%rsi),%ymm12 - vmovdqa 160(%rsi),%ymm13 - - vpcmpeqd %ymm1,%ymm5,%ymm9 - vpcmpeqd %ymm1,%ymm10,%ymm14 - - vpaddd %ymm0,%ymm5,%ymm5 - vpaddd %ymm0,%ymm10,%ymm10 - leaq 192(%rsi),%rsi - - vpand %ymm9,%ymm6,%ymm6 - vpand %ymm9,%ymm7,%ymm7 - vpand %ymm9,%ymm8,%ymm8 - vpand %ymm14,%ymm11,%ymm11 - vpand %ymm14,%ymm12,%ymm12 - vpand %ymm14,%ymm13,%ymm13 - - vpxor %ymm6,%ymm2,%ymm2 - vpxor %ymm7,%ymm3,%ymm3 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm11,%ymm2,%ymm2 - vpxor %ymm12,%ymm3,%ymm3 - vpxor %ymm13,%ymm4,%ymm4 - - decq %rax - jnz .Lselect_loop_avx2_w5 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vmovdqu %ymm4,64(%rdi) - vzeroupper - .byte 0xf3,0xc3 -.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 - - - .globl ecp_nistz256_avx2_select_w7 .type ecp_nistz256_avx2_select_w7,@function .align 32 ecp_nistz256_avx2_select_w7: -.Lavx2_select_w7: - vzeroupper - vmovdqa .LThree(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - - vmovdqa .LOne(%rip),%ymm4 - vmovdqa .LTwo(%rip),%ymm8 - vmovdqa .LThree(%rip),%ymm12 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - - movq $21,%rax -.Lselect_loop_avx2_w7: - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vmovdqa 64(%rsi),%ymm9 - vmovdqa 96(%rsi),%ymm10 - - vmovdqa 128(%rsi),%ymm13 - vmovdqa 160(%rsi),%ymm14 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - vpcmpeqd %ymm1,%ymm8,%ymm11 - vpcmpeqd %ymm1,%ymm12,%ymm15 - - vpaddd %ymm0,%ymm4,%ymm4 - vpaddd %ymm0,%ymm8,%ymm8 - vpaddd %ymm0,%ymm12,%ymm12 - leaq 192(%rsi),%rsi - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - vpand %ymm11,%ymm9,%ymm9 - vpand %ymm11,%ymm10,%ymm10 - vpand %ymm15,%ymm13,%ymm13 - vpand %ymm15,%ymm14,%ymm14 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm13,%ymm2,%ymm2 - vpxor %ymm14,%ymm3,%ymm3 - - decq %rax - jnz .Lselect_loop_avx2_w7 - - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vzeroupper +.byte 0x0f,0x0b .byte 0xf3,0xc3 .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 .type __ecp_nistz256_add_toq,@function @@ -1581,10 +1113,6 @@ __ecp_nistz256_mul_by_2q: .type ecp_nistz256_point_double,@function .align 32 ecp_nistz256_point_double: - movl $524544,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $524544,%ecx - je .Lpoint_doublex pushq %rbp pushq %rbx pushq %r12 @@ -1593,6 +1121,7 @@ ecp_nistz256_point_double: pushq %r15 subq $160+8,%rsp +.Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 @@ -1786,10 +1315,6 @@ ecp_nistz256_point_double: .type ecp_nistz256_point_add,@function .align 32 ecp_nistz256_point_add: - movl $524544,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $524544,%ecx - je .Lpoint_addx pushq %rbp pushq %rbx pushq %r12 @@ -1817,7 +1342,7 @@ ecp_nistz256_point_add: por %xmm1,%xmm3 movdqu 0(%rsi),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd 
$0xb1,%xmm3,%xmm5 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 @@ -1827,7 +1352,7 @@ ecp_nistz256_point_add: movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1847,10 +1372,10 @@ ecp_nistz256_point_add: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 @@ -1859,6 +1384,7 @@ ecp_nistz256_point_add: movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi @@ -1950,7 +1476,7 @@ ecp_nistz256_point_add: testq %r8,%r8 jnz .Ladd_proceedq testq %r9,%r9 - jz .Ladd_proceedq + jz .Ladd_doubleq .byte 102,72,15,126,199 pxor %xmm0,%xmm0 @@ -1962,6 +1488,13 @@ ecp_nistz256_point_add: movdqu %xmm0,80(%rdi) jmp .Ladd_doneq +.align 32 +.Ladd_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp .Lpoint_double_shortcutq + .align 32 .Ladd_proceedq: movq 0+64(%rsp),%rax @@ -2179,10 +1712,6 @@ ecp_nistz256_point_add: .type ecp_nistz256_point_add_affine,@function .align 32 ecp_nistz256_point_add_affine: - movl $524544,%ecx - andl OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $524544,%ecx - je .Lpoint_add_affinex pushq %rbp pushq %rbx pushq %r12 @@ -2213,13 +1742,13 @@ ecp_nistz256_point_add_affine: por %xmm1,%xmm3 movdqu 0(%rbx),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -2235,13 +1764,13 @@ ecp_nistz256_point_add_affine: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 @@ -2482,1023 +2011,3 @@ ecp_nistz256_point_add_affine: popq %rbp .byte 0xf3,0xc3 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine -.type __ecp_nistz256_add_tox,@function -.align 32 -__ecp_nistz256_add_tox: - xorq %r11,%r11 - adcq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox - -.type __ecp_nistz256_sub_fromx,@function -.align 32 -__ecp_nistz256_sub_fromx: - xorq %r11,%r11 - sbbq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - movq %r13,%rbp - sbbq $0,%r11 - - xorq %r10,%r10 - adcq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx - -.type __ecp_nistz256_subx,@function -.align 32 
-__ecp_nistz256_subx: - xorq %r11,%r11 - sbbq %r12,%rax - sbbq %r13,%rbp - movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq $0,%r11 - - xorq %r9,%r9 - adcq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - - btq $0,%r11 - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - cmovcq %rcx,%r8 - cmovcq %r10,%r9 - - .byte 0xf3,0xc3 -.size __ecp_nistz256_subx,.-__ecp_nistz256_subx - -.type __ecp_nistz256_mul_by_2x,@function -.align 32 -__ecp_nistz256_mul_by_2x: - xorq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x -.type ecp_nistz256_point_doublex,@function -.align 32 -ecp_nistz256_point_doublex: -.Lpoint_doublex: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $160+8,%rsp - - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq .Lpoly+8(%rip),%r14 - movq .Lpoly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-128(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 32(%rbx),%rdx - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-128(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montx - call __ecp_nistz256_mul_by_2x - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montx - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq %r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq %r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rdx - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call 
__ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 0+32(%rsp),%rdx - movq 8+32(%rsp),%r14 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 -.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montx - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - call __ecp_nistz256_subx - - movq 32(%rsp),%rdx - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-128(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromx - - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - .byte 0xf3,0xc3 -.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex -.type ecp_nistz256_point_addx,@function -.align 32 -ecp_nistz256_point_addx: -.Lpoint_addx: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $576+8,%rsp - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - por %xmm0,%xmm1 - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - por %xmm2,%xmm3 - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm1,%xmm3 - - movdqu 0(%rsi),%xmm0 - pshufd $177,%xmm3,%xmm5 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $30,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-128(%rsi),%rsi - movq %rdx,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rdx - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 - - leaq 64-128(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 416(%rsp),%rdx - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - 
movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 512(%rsp),%rdx - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq -128+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 480(%rsp),%rdx - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 0x3e - jnz .Ladd_proceedx -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - testq %r8,%r8 - jnz .Ladd_proceedx - testq %r9,%r9 - jz .Ladd_proceedx - -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp .Ladd_donex - -.align 32 -.Ladd_proceedx: - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq -128+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0(%rsp),%rdx - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 160(%rsp),%rdx - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - addq %r12,%r12 - leaq 96(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - sbbq %r11,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - testq %r11,%r11 - - cmovzq %rax,%r12 - movq 0(%rsi),%rax - cmovzq %rbp,%r13 - movq 8(%rsi),%rbp - cmovzq %rcx,%r8 - movq 16(%rsi),%rcx - cmovzq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 
0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -.Ladd_donex: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - .byte 0xf3,0xc3 -.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx -.type ecp_nistz256_point_add_affinex,@function -.align 32 -ecp_nistz256_point_add_affinex: -.Lpoint_add_affinex: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $480+8,%rsp - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - por %xmm0,%xmm1 - movdqa %xmm2,352(%rsp) - movdqa %xmm3,352+16(%rsp) - por %xmm2,%xmm3 - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm1,%xmm3 - - movdqu 0(%rbx),%xmm0 - pshufd $177,%xmm3,%xmm5 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $30,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-128(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 - movq 0(%rbx),%rdx - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 
32-128(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+96(%rsp),%rdx - movq 8+96(%rsp),%r14 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq -128+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - sbbq %r11,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - testq %r11,%r11 - - cmovzq %rax,%r12 - movq 0(%rsi),%rax - cmovzq %rbp,%r13 - movq 8(%rsi),%rbp - cmovzq %rcx,%r8 - movq 16(%rsi),%rcx - cmovzq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rdx - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq -128+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand .LONE_mont(%rip),%xmm2 - pand .LONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa 
%xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - .byte 0xf3,0xc3 -.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex diff --git a/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s b/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s index 4e82736a3eaf3a..35ebd9b4e09076 100644 --- a/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/modes/aesni-gcm-x86_64.s @@ -1,753 +1,15 @@ .text -.type _aesni_ctr32_ghash_6x,@function -.align 32 -_aesni_ctr32_ghash_6x: - vmovdqu 32(%r11),%xmm2 - subq $6,%rdx - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu 0-128(%rcx),%xmm15 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpaddb %xmm2,%xmm11,%xmm12 - vpaddb %xmm2,%xmm12,%xmm13 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm15,%xmm1,%xmm9 - vmovdqu %xmm4,16+8(%rsp) - jmp .Loop6x - -.align 32 -.Loop6x: - addl $100663296,%ebx - jc .Lhandle_ctr32 - vmovdqu 0-32(%r9),%xmm3 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm15,%xmm10,%xmm10 - vpxor %xmm15,%xmm11,%xmm11 - -.Lresume_ctr32: - vmovdqu %xmm1,(%r8) - vpclmulqdq $16,%xmm3,%xmm7,%xmm5 - vpxor %xmm15,%xmm12,%xmm12 - vmovups 16-128(%rcx),%xmm2 - vpclmulqdq $1,%xmm3,%xmm7,%xmm6 - xorq %r12,%r12 - cmpq %r14,%r15 - - vaesenc %xmm2,%xmm9,%xmm9 - vmovdqu 48+8(%rsp),%xmm0 - vpxor %xmm15,%xmm13,%xmm13 - vpclmulqdq $0,%xmm3,%xmm7,%xmm1 - vaesenc %xmm2,%xmm10,%xmm10 - vpxor %xmm15,%xmm14,%xmm14 - setnc %r12b - vpclmulqdq $17,%xmm3,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vmovdqu 16-32(%r9),%xmm3 - negq %r12 - vaesenc %xmm2,%xmm12,%xmm12 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0,%xmm3,%xmm0,%xmm5 - vpxor %xmm4,%xmm8,%xmm8 - vaesenc %xmm2,%xmm13,%xmm13 - vpxor %xmm5,%xmm1,%xmm4 - andq $96,%r12 - vmovups 32-128(%rcx),%xmm15 - vpclmulqdq $16,%xmm3,%xmm0,%xmm1 - vaesenc %xmm2,%xmm14,%xmm14 - - vpclmulqdq $1,%xmm3,%xmm0,%xmm2 - leaq (%r14,%r12,1),%r14 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpclmulqdq $17,%xmm3,%xmm0,%xmm3 - vmovdqu 64+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 88(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 80(%r14),%r12 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,32+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,40+8(%rsp) - vmovdqu 48-32(%r9),%xmm5 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 48-128(%rcx),%xmm15 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0,%xmm5,%xmm0,%xmm1 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $16,%xmm5,%xmm0,%xmm2 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm3,%xmm7,%xmm7 - vpclmulqdq $1,%xmm5,%xmm0,%xmm3 - vaesenc %xmm15,%xmm11,%xmm11 - vpclmulqdq $17,%xmm5,%xmm0,%xmm5 - vmovdqu 80+8(%rsp),%xmm0 - vaesenc 
%xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqu 64-32(%r9),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 64-128(%rcx),%xmm15 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0,%xmm1,%xmm0,%xmm2 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $16,%xmm1,%xmm0,%xmm3 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 72(%r14),%r13 - vpxor %xmm5,%xmm7,%xmm7 - vpclmulqdq $1,%xmm1,%xmm0,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 64(%r14),%r12 - vpclmulqdq $17,%xmm1,%xmm0,%xmm1 - vmovdqu 96+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,48+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,56+8(%rsp) - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 96-32(%r9),%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 80-128(%rcx),%xmm15 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0,%xmm2,%xmm0,%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $16,%xmm2,%xmm0,%xmm5 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 56(%r14),%r13 - vpxor %xmm1,%xmm7,%xmm7 - vpclmulqdq $1,%xmm2,%xmm0,%xmm1 - vpxor 112+8(%rsp),%xmm8,%xmm8 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 48(%r14),%r12 - vpclmulqdq $17,%xmm2,%xmm0,%xmm2 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,64+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,72+8(%rsp) - vpxor %xmm3,%xmm4,%xmm4 - vmovdqu 112-32(%r9),%xmm3 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 96-128(%rcx),%xmm15 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $16,%xmm3,%xmm8,%xmm5 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $1,%xmm3,%xmm8,%xmm1 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 40(%r14),%r13 - vpxor %xmm2,%xmm7,%xmm7 - vpclmulqdq $0,%xmm3,%xmm8,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 32(%r14),%r12 - vpclmulqdq $17,%xmm3,%xmm8,%xmm8 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,80+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,88+8(%rsp) - vpxor %xmm5,%xmm6,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor %xmm1,%xmm6,%xmm6 - - vmovups 112-128(%rcx),%xmm15 - vpslldq $8,%xmm6,%xmm5 - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 16(%r11),%xmm3 - - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm5,%xmm4,%xmm4 - movbeq 24(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 16(%r14),%r12 - vpalignr $8,%xmm4,%xmm4,%xmm0 - vpclmulqdq $16,%xmm3,%xmm4,%xmm4 - movq %r13,96+8(%rsp) - vaesenc %xmm15,%xmm12,%xmm12 - movq %r12,104+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - vmovups 128-128(%rcx),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 144-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm10,%xmm10 - vpsrldq $8,%xmm6,%xmm6 - vaesenc %xmm1,%xmm11,%xmm11 - vpxor %xmm6,%xmm7,%xmm7 - vaesenc %xmm1,%xmm12,%xmm12 - vpxor %xmm0,%xmm4,%xmm4 - movbeq 8(%r14),%r13 - vaesenc %xmm1,%xmm13,%xmm13 - movbeq 0(%r14),%r12 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 160-128(%rcx),%xmm1 - cmpl $11,%ebp - jb .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 176-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 192-128(%rcx),%xmm1 - je .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - 
vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 208-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 224-128(%rcx),%xmm1 - jmp .Lenc_tail - -.align 32 -.Lhandle_ctr32: - vmovdqu (%r11),%xmm0 - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vmovdqu 0-32(%r9),%xmm3 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm15,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm15,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpshufb %xmm0,%xmm14,%xmm14 - vpshufb %xmm0,%xmm1,%xmm1 - jmp .Lresume_ctr32 - -.align 32 -.Lenc_tail: - vaesenc %xmm15,%xmm9,%xmm9 - vmovdqu %xmm7,16+8(%rsp) - vpalignr $8,%xmm4,%xmm4,%xmm8 - vaesenc %xmm15,%xmm10,%xmm10 - vpclmulqdq $16,%xmm3,%xmm4,%xmm4 - vpxor 0(%rdi),%xmm1,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 16(%rdi),%xmm1,%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 32(%rdi),%xmm1,%xmm5 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 48(%rdi),%xmm1,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 64(%rdi),%xmm1,%xmm7 - vpxor 80(%rdi),%xmm1,%xmm3 - vmovdqu (%r8),%xmm1 - - vaesenclast %xmm2,%xmm9,%xmm9 - vmovdqu 32(%r11),%xmm2 - vaesenclast %xmm0,%xmm10,%xmm10 - vpaddb %xmm2,%xmm1,%xmm0 - movq %r13,112+8(%rsp) - leaq 96(%rdi),%rdi - vaesenclast %xmm5,%xmm11,%xmm11 - vpaddb %xmm2,%xmm0,%xmm5 - movq %r12,120+8(%rsp) - leaq 96(%rsi),%rsi - vmovdqu 0-128(%rcx),%xmm15 - vaesenclast %xmm6,%xmm12,%xmm12 - vpaddb %xmm2,%xmm5,%xmm6 - vaesenclast %xmm7,%xmm13,%xmm13 - vpaddb %xmm2,%xmm6,%xmm7 - vaesenclast %xmm3,%xmm14,%xmm14 - vpaddb %xmm2,%xmm7,%xmm3 - - addq $96,%r10 - subq $6,%rdx - jc .L6x_done - - vmovups %xmm9,-96(%rsi) - vpxor %xmm15,%xmm1,%xmm9 - vmovups %xmm10,-80(%rsi) - vmovdqa %xmm0,%xmm10 - vmovups %xmm11,-64(%rsi) - vmovdqa %xmm5,%xmm11 - vmovups %xmm12,-48(%rsi) - vmovdqa %xmm6,%xmm12 - vmovups %xmm13,-32(%rsi) - vmovdqa %xmm7,%xmm13 - vmovups %xmm14,-16(%rsi) - vmovdqa %xmm3,%xmm14 - vmovdqu 32+8(%rsp),%xmm7 - jmp .Loop6x - -.L6x_done: - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpxor %xmm4,%xmm8,%xmm8 - +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +aesni_gcm_encrypt: + xorl %eax,%eax .byte 0xf3,0xc3 -.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,@function -.align 32 aesni_gcm_decrypt: - xorq %r10,%r10 - cmpq $96,%rdx - jb .Lgcm_dec_abort - - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $3968,%r15 - vmovdqu (%r9),%xmm8 - andq $-128,%rsp - vmovdqu (%r11),%xmm0 - leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 - movl 240-128(%rcx),%ebp - vpshufb %xmm0,%xmm8,%xmm8 - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Ldec_no_key_aliasing - cmpq $768,%r15 - jnc .Ldec_no_key_aliasing - subq %r15,%rsp -.Ldec_no_key_aliasing: - - vmovdqu 80(%rdi),%xmm7 - leaq (%rdi),%r14 - vmovdqu 64(%rdi),%xmm4 - leaq -192(%rdi,%rdx,1),%r15 - vmovdqu 48(%rdi),%xmm5 - shrq $4,%rdx - xorq %r10,%r10 - vmovdqu 32(%rdi),%xmm6 - vpshufb %xmm0,%xmm7,%xmm7 - vmovdqu 16(%rdi),%xmm2 - vpshufb %xmm0,%xmm4,%xmm4 - vmovdqu (%rdi),%xmm3 - vpshufb %xmm0,%xmm5,%xmm5 - vmovdqu %xmm4,48(%rsp) - vpshufb %xmm0,%xmm6,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb 
%xmm0,%xmm2,%xmm2 - vmovdqu %xmm6,80(%rsp) - vpshufb %xmm0,%xmm3,%xmm3 - vmovdqu %xmm2,96(%rsp) - vmovdqu %xmm3,112(%rsp) - - call _aesni_ctr32_ghash_6x - - vmovups %xmm9,-96(%rsi) - vmovups %xmm10,-80(%rsi) - vmovups %xmm11,-64(%rsi) - vmovups %xmm12,-48(%rsi) - vmovups %xmm13,-32(%rsi) - vmovups %xmm14,-16(%rsi) - - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lgcm_dec_abort: - movq %r10,%rax + xorl %eax,%eax .byte 0xf3,0xc3 .size aesni_gcm_decrypt,.-aesni_gcm_decrypt -.type _aesni_ctr32_6x,@function -.align 32 -_aesni_ctr32_6x: - vmovdqu 0-128(%rcx),%xmm4 - vmovdqu 32(%r11),%xmm2 - leaq -1(%rbp),%r13 - vmovups 16-128(%rcx),%xmm15 - leaq 32-128(%rcx),%r12 - vpxor %xmm4,%xmm1,%xmm9 - addl $100663296,%ebx - jc .Lhandle_ctr32_2 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddb %xmm2,%xmm11,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddb %xmm2,%xmm12,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 - -.align 16 -.Loop_ctr32: - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - vmovups (%r12),%xmm15 - leaq 16(%r12),%r12 - decl %r13d - jnz .Loop_ctr32 - - vmovdqu (%r12),%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 0(%rdi),%xmm3,%xmm4 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor 16(%rdi),%xmm3,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 32(%rdi),%xmm3,%xmm6 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 48(%rdi),%xmm3,%xmm8 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 64(%rdi),%xmm3,%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 80(%rdi),%xmm3,%xmm3 - leaq 96(%rdi),%rdi - - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm5,%xmm10,%xmm10 - vaesenclast %xmm6,%xmm11,%xmm11 - vaesenclast %xmm8,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - vaesenclast %xmm3,%xmm14,%xmm14 - vmovups %xmm9,0(%rsi) - vmovups %xmm10,16(%rsi) - vmovups %xmm11,32(%rsi) - vmovups %xmm12,48(%rsi) - vmovups %xmm13,64(%rsi) - vmovups %xmm14,80(%rsi) - leaq 96(%rsi),%rsi - - .byte 0xf3,0xc3 -.align 32 -.Lhandle_ctr32_2: - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpshufb %xmm0,%xmm14,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpshufb %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 -.size _aesni_ctr32_6x,.-_aesni_ctr32_6x - -.globl aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -.align 32 -aesni_gcm_encrypt: - xorq %r10,%r10 - cmpq $288,%rdx - jb .Lgcm_enc_abort - - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $3968,%r15 - leaq 128(%rcx),%rcx - vmovdqu (%r11),%xmm0 - andq $-128,%rsp - movl 240-128(%rcx),%ebp - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Lenc_no_key_aliasing - cmpq $768,%r15 - jnc 
.Lenc_no_key_aliasing - subq %r15,%rsp -.Lenc_no_key_aliasing: - - leaq (%rsi),%r14 - leaq -192(%rsi,%rdx,1),%r15 - shrq $4,%rdx - - call _aesni_ctr32_6x - vpshufb %xmm0,%xmm9,%xmm8 - vpshufb %xmm0,%xmm10,%xmm2 - vmovdqu %xmm8,112(%rsp) - vpshufb %xmm0,%xmm11,%xmm4 - vmovdqu %xmm2,96(%rsp) - vpshufb %xmm0,%xmm12,%xmm5 - vmovdqu %xmm4,80(%rsp) - vpshufb %xmm0,%xmm13,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm14,%xmm7 - vmovdqu %xmm6,48(%rsp) - - call _aesni_ctr32_6x - - vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 - subq $12,%rdx - movq $192,%r10 - vpshufb %xmm0,%xmm8,%xmm8 - - call _aesni_ctr32_ghash_6x - vmovdqu 32(%rsp),%xmm7 - vmovdqu (%r11),%xmm0 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm7,%xmm7,%xmm1 - vmovdqu 32-32(%r9),%xmm15 - vmovups %xmm9,-96(%rsi) - vpshufb %xmm0,%xmm9,%xmm9 - vpxor %xmm7,%xmm1,%xmm1 - vmovups %xmm10,-80(%rsi) - vpshufb %xmm0,%xmm10,%xmm10 - vmovups %xmm11,-64(%rsi) - vpshufb %xmm0,%xmm11,%xmm11 - vmovups %xmm12,-48(%rsi) - vpshufb %xmm0,%xmm12,%xmm12 - vmovups %xmm13,-32(%rsi) - vpshufb %xmm0,%xmm13,%xmm13 - vmovups %xmm14,-16(%rsi) - vpshufb %xmm0,%xmm14,%xmm14 - vmovdqu %xmm9,16(%rsp) - vmovdqu 48(%rsp),%xmm6 - vmovdqu 16-32(%r9),%xmm0 - vpunpckhqdq %xmm6,%xmm6,%xmm2 - vpclmulqdq $0,%xmm3,%xmm7,%xmm5 - vpxor %xmm6,%xmm2,%xmm2 - vpclmulqdq $17,%xmm3,%xmm7,%xmm7 - vpclmulqdq $0,%xmm15,%xmm1,%xmm1 - - vmovdqu 64(%rsp),%xmm9 - vpclmulqdq $0,%xmm0,%xmm6,%xmm4 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm9,%xmm9,%xmm5 - vpclmulqdq $17,%xmm0,%xmm6,%xmm6 - vpxor %xmm9,%xmm5,%xmm5 - vpxor %xmm7,%xmm6,%xmm6 - vpclmulqdq $16,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vmovdqu 80(%rsp),%xmm1 - vpclmulqdq $0,%xmm3,%xmm9,%xmm7 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm4,%xmm7,%xmm7 - vpunpckhqdq %xmm1,%xmm1,%xmm4 - vpclmulqdq $17,%xmm3,%xmm9,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm6,%xmm9,%xmm9 - vpclmulqdq $0,%xmm15,%xmm5,%xmm5 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 96(%rsp),%xmm2 - vpclmulqdq $0,%xmm0,%xmm1,%xmm6 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm7,%xmm6,%xmm6 - vpunpckhqdq %xmm2,%xmm2,%xmm7 - vpclmulqdq $17,%xmm0,%xmm1,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpxor %xmm9,%xmm1,%xmm1 - vpclmulqdq $16,%xmm15,%xmm4,%xmm4 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm5,%xmm4,%xmm4 - - vpxor 112(%rsp),%xmm8,%xmm8 - vpclmulqdq $0,%xmm3,%xmm2,%xmm5 - vmovdqu 112-32(%r9),%xmm0 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $17,%xmm3,%xmm2,%xmm2 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm1,%xmm2,%xmm2 - vpclmulqdq $0,%xmm15,%xmm7,%xmm7 - vpxor %xmm4,%xmm7,%xmm4 - - vpclmulqdq $0,%xmm0,%xmm8,%xmm6 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm1 - vpclmulqdq $17,%xmm0,%xmm8,%xmm8 - vpxor %xmm14,%xmm1,%xmm1 - vpxor %xmm5,%xmm6,%xmm5 - vpclmulqdq $16,%xmm15,%xmm9,%xmm9 - vmovdqu 32-32(%r9),%xmm15 - vpxor %xmm2,%xmm8,%xmm7 - vpxor %xmm4,%xmm9,%xmm6 - - vmovdqu 16-32(%r9),%xmm0 - vpxor %xmm5,%xmm7,%xmm9 - vpclmulqdq $0,%xmm3,%xmm14,%xmm4 - vpxor %xmm9,%xmm6,%xmm6 - vpunpckhqdq %xmm13,%xmm13,%xmm2 - vpclmulqdq $17,%xmm3,%xmm14,%xmm14 - vpxor %xmm13,%xmm2,%xmm2 - vpslldq $8,%xmm6,%xmm9 - vpclmulqdq $0,%xmm15,%xmm1,%xmm1 - vpxor %xmm9,%xmm5,%xmm8 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm6,%xmm7,%xmm7 - - vpclmulqdq $0,%xmm0,%xmm13,%xmm5 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm12,%xmm12,%xmm9 - vpclmulqdq $17,%xmm0,%xmm13,%xmm13 - vpxor %xmm12,%xmm9,%xmm9 - vpxor %xmm14,%xmm13,%xmm13 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpclmulqdq $16,%xmm15,%xmm2,%xmm2 - vmovdqu 
80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0,%xmm3,%xmm12,%xmm4 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm11,%xmm11,%xmm1 - vpclmulqdq $17,%xmm3,%xmm12,%xmm12 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm13,%xmm12,%xmm12 - vxorps 16(%rsp),%xmm7,%xmm7 - vpclmulqdq $0,%xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm9,%xmm9 - - vpclmulqdq $16,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0,%xmm0,%xmm11,%xmm5 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm10,%xmm10,%xmm2 - vpclmulqdq $17,%xmm0,%xmm11,%xmm11 - vpxor %xmm10,%xmm2,%xmm2 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpxor %xmm12,%xmm11,%xmm11 - vpclmulqdq $16,%xmm15,%xmm1,%xmm1 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - - vxorps %xmm7,%xmm14,%xmm14 - vpclmulqdq $16,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0,%xmm3,%xmm10,%xmm4 - vmovdqu 112-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpclmulqdq $17,%xmm3,%xmm10,%xmm10 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm11,%xmm10,%xmm10 - vpclmulqdq $0,%xmm15,%xmm2,%xmm2 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0,%xmm0,%xmm8,%xmm5 - vpclmulqdq $17,%xmm0,%xmm8,%xmm7 - vpxor %xmm4,%xmm5,%xmm5 - vpclmulqdq $16,%xmm15,%xmm9,%xmm6 - vpxor %xmm10,%xmm7,%xmm7 - vpxor %xmm2,%xmm6,%xmm6 - - vpxor %xmm5,%xmm7,%xmm4 - vpxor %xmm4,%xmm6,%xmm6 - vpslldq $8,%xmm6,%xmm1 - vmovdqu 16(%r11),%xmm3 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm1,%xmm5,%xmm8 - vpxor %xmm6,%xmm7,%xmm7 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $16,%xmm3,%xmm8,%xmm8 - vpxor %xmm2,%xmm8,%xmm8 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $16,%xmm3,%xmm8,%xmm8 - vpxor %xmm7,%xmm2,%xmm2 - vpxor %xmm2,%xmm8,%xmm8 - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lgcm_enc_abort: - movq %r10,%rax - .byte 0xf3,0xc3 -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt -.align 64 -.Lbswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.Lpoly: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -.Lone_msb: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -.Ltwo_lsb: -.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.Lone_lsb: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 diff --git a/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s b/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s index 1cfe19cb55ecf5..e9ffdc2de2ea85 100644 --- a/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/modes/ghash-x86_64.s @@ -20,14 +20,14 @@ gcm_gmult_4bit: movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl movq %r8,%rdx jmp .Loop1 .align 16 .Loop1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 @@ -43,13 +43,13 @@ gcm_gmult_4bit: js .Lbreak1 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 @@ -58,19 +58,19 @@ gcm_gmult_4bit: .align 16 .Lbreak1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + 
andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 @@ -661,10 +661,10 @@ gcm_ghash_4bit: gcm_init_clmul: .L_init_clmul: movdqu (%rsi),%xmm2 - pshufd $78,%xmm2,%xmm2 + pshufd $0b01001110,%xmm2,%xmm2 - pshufd $255,%xmm2,%xmm4 + pshufd $0b11111111,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 @@ -678,11 +678,11 @@ gcm_init_clmul: pxor %xmm5,%xmm2 - pshufd $78,%xmm2,%xmm6 + pshufd $0b01001110,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -718,8 +718,8 @@ gcm_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm2,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 @@ -727,7 +727,7 @@ gcm_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -765,7 +765,7 @@ gcm_init_clmul: pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -801,8 +801,8 @@ gcm_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm5,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm5,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 @@ -822,7 +822,7 @@ gcm_gmult_clmul: movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -874,20 +874,20 @@ gcm_ghash_clmul: movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail movdqu 16(%rsi),%xmm6 movl OPENSSL_ia32cap_P+4(%rip),%eax - cmpq $48,%rcx + cmpq $0x30,%rcx jb .Lskip4x andl $71303168,%eax cmpl $4194304,%eax je .Lskip4x - subq $48,%rcx - movq $11547335547999543296,%rax + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 @@ -899,14 +899,14 @@ gcm_ghash_clmul: .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 @@ -921,12 +921,12 @@ gcm_ghash_clmul: .byte 102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 @@ -934,7 +934,7 @@ gcm_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jc .Ltail4x jmp .Lmod4_loop @@ -949,14 +949,14 @@ gcm_ghash_clmul: movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 
102,68,15,58,68,218,0 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 @@ -1000,7 +1000,7 @@ gcm_ghash_clmul: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 @@ -1010,14 +1010,14 @@ gcm_ghash_clmul: movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jnc .Lmod4_loop .Ltail4x: @@ -1061,10 +1061,10 @@ gcm_ghash_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - addq $64,%rcx + addq $0x40,%rcx jz .Ldone movdqu 32(%rsi),%xmm7 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail .Lskip4x: @@ -1079,7 +1079,7 @@ gcm_ghash_clmul: pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 @@ -1087,7 +1087,7 @@ gcm_ghash_clmul: leaq 32(%rdx),%rdx nop - subq $32,%rcx + subq $0x20,%rcx jbe .Leven_tail nop jmp .Lmod_loop @@ -1096,7 +1096,7 @@ gcm_ghash_clmul: .Lmod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1134,7 +1134,7 @@ gcm_ghash_clmul: pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 - pshufd $78,%xmm5,%xmm4 + pshufd $0b01001110,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 @@ -1150,13 +1150,13 @@ gcm_ghash_clmul: .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 - subq $32,%rcx + subq $0x20,%rcx ja .Lmod_loop .Leven_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1204,7 +1204,7 @@ gcm_ghash_clmul: .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -1249,108 +1249,7 @@ gcm_ghash_clmul: .type gcm_init_avx,@function .align 32 gcm_init_avx: - vzeroupper - - vmovdqu (%rsi),%xmm2 - vpshufd $78,%xmm2,%xmm2 - - - vpshufd $255,%xmm2,%xmm4 - vpsrlq $63,%xmm2,%xmm3 - vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5 - vpcmpgtd %xmm4,%xmm5,%xmm5 - vpslldq $8,%xmm3,%xmm3 - vpor %xmm3,%xmm2,%xmm2 - - - vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 - vpxor %xmm5,%xmm2,%xmm2 - - vpunpckhqdq %xmm2,%xmm2,%xmm6 - vmovdqa %xmm2,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - movq $4,%r10 - jmp .Linit_start_avx -.align 32 -.Linit_loop_avx: - vpalignr $8,%xmm3,%xmm4,%xmm5 - vmovdqu %xmm5,-16(%rdi) - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 -.Linit_start_avx: - vmovdqa %xmm0,%xmm5 - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm2,%xmm0,%xmm1 - vpclmulqdq 
$0,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - vpshufd $78,%xmm5,%xmm3 - vpshufd $78,%xmm0,%xmm4 - vpxor %xmm5,%xmm3,%xmm3 - vmovdqu %xmm5,0(%rdi) - vpxor %xmm0,%xmm4,%xmm4 - vmovdqu %xmm0,16(%rdi) - leaq 48(%rdi),%rdi - subq $1,%r10 - jnz .Linit_loop_avx - - vpalignr $8,%xmm4,%xmm3,%xmm5 - vmovdqu %xmm5,-16(%rdi) - - vzeroupper - .byte 0xf3,0xc3 + jmp .L_init_clmul .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx .type gcm_gmult_avx,@function @@ -1362,377 +1261,7 @@ gcm_gmult_avx: .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: - vzeroupper - - vmovdqu (%rdi),%xmm10 - leaq .L0x1c2_polynomial(%rip),%r10 - leaq 64(%rsi),%rsi - vmovdqu .Lbswap_mask(%rip),%xmm13 - vpshufb %xmm13,%xmm10,%xmm10 - cmpq $128,%rcx - jb .Lshort_avx - subq $128,%rcx - - vmovdqu 112(%rdx),%xmm14 - vmovdqu 0-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vmovdqu 32-64(%rsi),%xmm7 - - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm14,%xmm9,%xmm9 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 80(%rdx),%xmm14 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 48-64(%rsi),%xmm6 - vpxor %xmm14,%xmm9,%xmm9 - vmovdqu 64(%rdx),%xmm15 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 48(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 32(%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 16(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu (%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - 
vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $16,%xmm7,%xmm9,%xmm2 - - leaq 128(%rdx),%rdx - cmpq $128,%rcx - jb .Ltail_avx - - vpxor %xmm10,%xmm15,%xmm15 - subq $128,%rcx - jmp .Loop8x_avx - -.align 32 -.Loop8x_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 112(%rdx),%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm15,%xmm8,%xmm8 - vpclmulqdq $0,%xmm6,%xmm15,%xmm10 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm11 - vmovdqu 0-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm12 - vmovdqu 32-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm3,%xmm10,%xmm10 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vxorps %xmm4,%xmm11,%xmm11 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm5,%xmm12,%xmm12 - vxorps %xmm15,%xmm8,%xmm8 - - vmovdqu 80(%rdx),%xmm14 - vpxor %xmm10,%xmm12,%xmm12 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpxor %xmm11,%xmm12,%xmm12 - vpslldq $8,%xmm12,%xmm9 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vpsrldq $8,%xmm12,%xmm12 - vpxor %xmm9,%xmm10,%xmm10 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vxorps %xmm12,%xmm11,%xmm11 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 64(%rdx),%xmm15 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vxorps %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - - vmovdqu 48(%rdx),%xmm14 - vpclmulqdq $16,(%r10),%xmm10,%xmm10 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 32(%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - vxorps %xmm12,%xmm10,%xmm10 - - vmovdqu 16(%rdx),%xmm14 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpclmulqdq $16,(%r10),%xmm10,%xmm10 - vxorps %xmm11,%xmm12,%xmm12 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu (%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm12,%xmm15,%xmm15 - vpclmulqdq $16,%xmm7,%xmm9,%xmm2 - vpxor %xmm10,%xmm15,%xmm15 - - leaq 128(%rdx),%rdx - subq $128,%rcx - jnc .Loop8x_avx - - addq $128,%rcx - jmp .Ltail_no_xor_avx - -.align 32 
-.Lshort_avx: - vmovdqu -16(%rdx,%rcx,1),%xmm14 - leaq (%rdx,%rcx,1),%rdx - vmovdqu 0-64(%rsi),%xmm6 - vmovdqu 32-64(%rsi),%xmm7 - vpshufb %xmm13,%xmm14,%xmm15 - - vmovdqa %xmm0,%xmm3 - vmovdqa %xmm1,%xmm4 - vmovdqa %xmm2,%xmm5 - subq $16,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -32(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $16,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -48(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vmovdqu 80-64(%rsi),%xmm7 - subq $16,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -64(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $16,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -80(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 96-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vmovdqu 128-64(%rsi),%xmm7 - subq $16,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -96(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $16,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -112(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 144-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vmovq 184-64(%rsi),%xmm7 - subq $16,%rcx - jmp .Ltail_avx - -.align 32 -.Ltail_avx: - vpxor %xmm10,%xmm15,%xmm15 -.Ltail_no_xor_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - - vmovdqu (%r10),%xmm12 - - vpxor %xmm0,%xmm3,%xmm10 - vpxor %xmm1,%xmm4,%xmm11 - vpxor %xmm2,%xmm5,%xmm5 - - vpxor %xmm10,%xmm5,%xmm5 - vpxor %xmm11,%xmm5,%xmm5 - vpslldq $8,%xmm5,%xmm9 - vpsrldq $8,%xmm5,%xmm5 - vpxor %xmm9,%xmm10,%xmm10 - vpxor %xmm5,%xmm11,%xmm11 - - vpclmulqdq $16,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - vpclmulqdq $16,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm11,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - cmpq $0,%rcx - jne .Lshort_avx - - vpshufb %xmm13,%xmm10,%xmm10 - vmovdqu %xmm10,(%rdi) - vzeroupper - .byte 0xf3,0xc3 + 
jmp .L_ghash_clmul .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s index 8da489ea45f931..4d25c99cf65bd3 100644 --- a/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/sha/sha1-mb-x86_64.s @@ -9,8 +9,6 @@ sha1_multi_block: movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut - testl $268435456,%ecx - jnz _avx_shortcut movq %rsp,%rax pushq %rbx pushq %rbp @@ -2601,10 +2599,10 @@ _shaext_shortcut: punpcklqdq %xmm5,%xmm0 punpckhqdq %xmm5,%xmm8 - pshufd $63,%xmm7,%xmm1 - pshufd $127,%xmm7,%xmm9 - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00111111,%xmm7,%xmm1 + pshufd $0b01111111,%xmm7,%xmm9 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 jmp .Loop_shaext .align 32 @@ -2859,8 +2857,8 @@ _shaext_shortcut: .byte 69,15,58,204,193,3 .byte 69,15,56,200,214 - pshufd $0,%xmm6,%xmm11 - pshufd $85,%xmm6,%xmm12 + pshufd $0x00,%xmm6,%xmm11 + pshufd $0x55,%xmm6,%xmm12 movdqa %xmm6,%xmm7 pcmpgtd %xmm4,%xmm11 pcmpgtd %xmm4,%xmm12 @@ -2890,8 +2888,8 @@ _shaext_shortcut: movl 280(%rsp),%edx - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 movdqa %xmm0,%xmm6 punpckldq %xmm8,%xmm0 @@ -2919,4291 +2917,6 @@ _shaext_shortcut: .Lepilogue_shaext: .byte 0xf3,0xc3 .size sha1_multi_block_shaext,.-sha1_multi_block_shaext -.type sha1_multi_block_avx,@function -.align 32 -sha1_multi_block_avx: -_avx_shortcut: - shrq $32,%rcx - cmpl $2,%edx - jb .Lavx - testl $32,%ecx - jnz _avx2_shortcut - jmp .Lavx -.align 32 -.Lavx: - movq %rsp,%rax - pushq %rbx - pushq %rbp - subq $288,%rsp - andq $-256,%rsp - movq %rax,272(%rsp) -.Lbody_avx: - leaq K_XX_XX(%rip),%rbp - leaq 256(%rsp),%rbx - - vzeroupper -.Loop_grande_avx: - movl %edx,280(%rsp) - xorl %edx,%edx - movq 0(%rsi),%r8 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r8 - movq 16(%rsi),%r9 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r9 - movq 32(%rsi),%r10 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r10 - movq 48(%rsi),%r11 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r11 - testl %edx,%edx - jz .Ldone_avx - - vmovdqu 0(%rdi),%xmm10 - leaq 128(%rsp),%rax - vmovdqu 32(%rdi),%xmm11 - vmovdqu 64(%rdi),%xmm12 - vmovdqu 96(%rdi),%xmm13 - vmovdqu 128(%rdi),%xmm14 - vmovdqu 96(%rbp),%xmm5 - jmp .Loop_avx - -.align 32 -.Loop_avx: - vmovdqa -32(%rbp),%xmm15 - vmovd (%r8),%xmm0 - leaq 64(%r8),%r8 - vmovd (%r9),%xmm2 - leaq 64(%r9),%r9 - vpinsrd $1,(%r10),%xmm0,%xmm0 - leaq 64(%r10),%r10 - vpinsrd $1,(%r11),%xmm2,%xmm2 - leaq 64(%r11),%r11 - vmovd -60(%r8),%xmm1 - vpunpckldq %xmm2,%xmm0,%xmm0 - vmovd -60(%r9),%xmm9 - vpshufb %xmm5,%xmm0,%xmm0 - vpinsrd $1,-60(%r10),%xmm1,%xmm1 - vpinsrd $1,-60(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,0-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -56(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -56(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb 
%xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-56(%r10),%xmm2,%xmm2 - vpinsrd $1,-56(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,16-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -52(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -52(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-52(%r10),%xmm3,%xmm3 - vpinsrd $1,-52(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,32-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -48(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -48(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-48(%r10),%xmm4,%xmm4 - vpinsrd $1,-48(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,48-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -44(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -44(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-44(%r10),%xmm0,%xmm0 - vpinsrd $1,-44(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,64-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -40(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -40(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-40(%r10),%xmm1,%xmm1 - vpinsrd $1,-40(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,80-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -36(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -36(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-36(%r10),%xmm2,%xmm2 - vpinsrd $1,-36(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,96-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -32(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -32(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-32(%r10),%xmm3,%xmm3 - vpinsrd $1,-32(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld 
$5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,112-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -28(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -28(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-28(%r10),%xmm4,%xmm4 - vpinsrd $1,-28(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,128-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -24(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -24(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-24(%r10),%xmm0,%xmm0 - vpinsrd $1,-24(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,144-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -20(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -20(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-20(%r10),%xmm1,%xmm1 - vpinsrd $1,-20(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,160-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -16(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -16(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-16(%r10),%xmm2,%xmm2 - vpinsrd $1,-16(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,176-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -12(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -12(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-12(%r10),%xmm3,%xmm3 - vpinsrd $1,-12(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,192-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -8(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -8(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-8(%r10),%xmm4,%xmm4 - vpinsrd $1,-8(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,208-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq 
%xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -4(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -4(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vmovdqa 0-128(%rax),%xmm1 - vpinsrd $1,-4(%r10),%xmm0,%xmm0 - vpinsrd $1,-4(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - prefetcht0 63(%r8) - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,224-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - prefetcht0 63(%r9) - vpxor %xmm7,%xmm6,%xmm6 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - prefetcht0 63(%r10) - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - prefetcht0 63(%r11) - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 16-128(%rax),%xmm2 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 32-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,240-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 128-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 48-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,0-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 144-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 64-128(%rax),%xmm0 - - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,16-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 160-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 80-128(%rax),%xmm1 - - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,32-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 176-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 96-128(%rax),%xmm2 - - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,48-128(%rax) - vpaddd 
 [removed hunk, continued: remaining AVX round code, loop tail, state checkpointing, and epilogue of sha1_multi_block_avx (.Loop_avx, .Loop_grande_avx, .Ldone_avx, .Lepilogue_avx, .size sha1_multi_block_avx)]
 [removed hunk, continued: declaration, prologue, per-lane input setup, and opening rounds of sha1_multi_block_avx2 (.type sha1_multi_block_avx2, _avx2_shortcut, .Lbody_avx2, .Loop_grande_avx2, .Loop_avx2)]
$2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 416-256-128(%rbx),%ymm11 - - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpand %ymm4,%ymm0,%ymm6 - vpxor 96-128(%rax),%ymm14,%ymm14 - - vpaddd %ymm6,%ymm1,%ymm1 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm0,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vmovdqu %ymm13,320-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm14,%ymm9 - vpand %ymm3,%ymm5,%ymm5 - vpaddd %ymm14,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 448-256-128(%rbx),%ymm12 - - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - vpand %ymm3,%ymm4,%ymm6 - vpxor 128-128(%rax),%ymm10,%ymm10 - - vpaddd %ymm6,%ymm0,%ymm0 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm4,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vmovdqu %ymm14,352-256-128(%rbx) - vpaddd %ymm14,%ymm0,%ymm0 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm10,%ymm9 - vpand %ymm2,%ymm5,%ymm5 - vpaddd %ymm10,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vmovdqa 64(%rbp),%ymm15 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 480-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,384-256-128(%rbx) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 160-128(%rax),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 0-128(%rax),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,416-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 192-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 32-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,448-256-128(%rbx) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 224-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 64-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,480-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 256-256-128(%rbx),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor 
%ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 96-128(%rax),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,0-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 288-256-128(%rbx),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 128-128(%rax),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,32-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 320-256-128(%rbx),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 160-128(%rax),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,64-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 352-256-128(%rbx),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 192-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,96-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 384-256-128(%rbx),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 224-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,128-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 416-256-128(%rbx),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 256-256-128(%rbx),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,160-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 448-256-128(%rbx),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 288-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa 
%ymm10,192-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 480-256-128(%rbx),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 320-256-128(%rbx),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,224-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 0-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 352-256-128(%rbx),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 32-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 384-256-128(%rbx),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 64-128(%rax),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 416-256-128(%rbx),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 96-128(%rax),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 448-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 128-128(%rax),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 480-256-128(%rbx),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 160-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - 
vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 0-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 192-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 32-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 224-128(%rax),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - - vpsrld $27,%ymm1,%ymm8 - vpaddd %ymm14,%ymm0,%ymm0 - vpxor %ymm3,%ymm5,%ymm5 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm6,%ymm2,%ymm2 - movl $1,%ecx - leaq 512(%rsp),%rbx - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r12 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r13 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r14 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r15 - cmpl 16(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 20(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 24(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 28(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqu (%rbx),%ymm5 - vpxor %ymm7,%ymm7,%ymm7 - vmovdqa %ymm5,%ymm6 - vpcmpgtd %ymm7,%ymm6,%ymm6 - vpaddd %ymm6,%ymm5,%ymm5 - - vpand %ymm6,%ymm0,%ymm0 - vpand %ymm6,%ymm1,%ymm1 - vpaddd 0(%rdi),%ymm0,%ymm0 - vpand %ymm6,%ymm2,%ymm2 - vpaddd 32(%rdi),%ymm1,%ymm1 - vpand %ymm6,%ymm3,%ymm3 - vpaddd 64(%rdi),%ymm2,%ymm2 - vpand %ymm6,%ymm4,%ymm4 - vpaddd 96(%rdi),%ymm3,%ymm3 - vpaddd 128(%rdi),%ymm4,%ymm4 - vmovdqu %ymm0,0(%rdi) - vmovdqu %ymm1,32(%rdi) - vmovdqu %ymm2,64(%rdi) - vmovdqu %ymm3,96(%rdi) - vmovdqu %ymm4,128(%rdi) - - vmovdqu %ymm5,(%rbx) - leaq 256+128(%rsp),%rbx - vmovdqu 96(%rbp),%ymm9 - decl %edx - jnz .Loop_avx2 - - - - - - - -.Ldone_avx2: - movq 544(%rsp),%rax - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 .align 256 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s index 22a031f368e505..38e9956cb68384 100644 --- a/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/sha/sha1-x86_64.s @@ -12,14 +12,6 @@ sha1_block_data_order: jz .Lialu testl $536870912,%r10d jnz _shaext_shortcut - andl $296,%r10d - cmpl $296,%r10d - je _avx2_shortcut - andl $268435456,%r8d - andl $1073741824,%r9d - orl %r9d,%r8d - cmpl $1342177280,%r8d - je _avx_shortcut jmp _ssse3_shortcut .align 16 @@ -1248,9 +1240,9 @@ _shaext_shortcut: movdqa K_XX_XX+160(%rip),%xmm3 movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 + pshufd $0b00011011,%xmm0,%xmm0 movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm1,%xmm1 
movdqu 32(%rsi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%rsi),%xmm7 @@ -1400,8 +1392,8 @@ _shaext_shortcut: jnz .Loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) .byte 0xf3,0xc3 @@ -2582,2803 +2574,6 @@ _ssse3_shortcut: .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 -.type sha1_block_data_order_avx,@function -.align 16 -sha1_block_data_order_avx: -_avx_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - leaq -64(%rsp),%rsp - vzeroupper - movq %rax,%r14 - andq $-64,%rsp - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - shlq $6,%r10 - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 - - movl 0(%r8),%eax - movl 4(%r8),%ebx - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl %ebx,%esi - movl 16(%r8),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm11,%xmm0,%xmm4 - vpaddd %xmm11,%xmm1,%xmm5 - vpaddd %xmm11,%xmm2,%xmm6 - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - jmp .Loop_avx -.align 16 -.Loop_avx: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%edi - addl 0(%rsp),%ebp - vpaddd %xmm3,%xmm11,%xmm9 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%ebp - vpxor %xmm2,%xmm8,%xmm8 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm10 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm4,%xmm4 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vpxor %xmm10,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm11,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm10 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - 
vpxor %xmm10,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vpaddd %xmm5,%xmm11,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm10 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm6,%xmm6 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm10,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd %xmm6,%xmm11,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm10 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm10,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm11,%xmm9 - addl %esi,%edx - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd 
%xmm0,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - xorl %eax,%edi - vpaddd %xmm3,%xmm11,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm11,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl 
%edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm11,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm11,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - 
vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm11,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm11,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm11,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r10,%r9 - je .Ldone_avx - 
vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm11,%xmm0,%xmm4 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm11,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm5,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm11,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm6,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) - andl %edi,%esi - jmp .Loop_avx - -.align 16 -.Ldone_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - 
xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroupper - - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx - movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.size sha1_block_data_order_avx,.-sha1_block_data_order_avx -.type sha1_block_data_order_avx2,@function -.align 16 -sha1_block_data_order_avx2: -_avx2_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - vzeroupper - movq %rax,%r14 - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - leaq -640(%rsp),%rsp - shlq $6,%r10 - leaq 64(%r9),%r13 - andq $-128,%rsp - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 - - movl 0(%r8),%eax - cmpq %r10,%r13 - cmovaeq %r9,%r13 - movl 4(%r8),%ebp - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl 16(%r8),%esi - vmovdqu 64(%r11),%ymm6 - - vmovdqu (%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - leaq 64(%r9),%r9 - vinserti128 $1,(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vpshufb %ymm6,%ymm0,%ymm0 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vpshufb %ymm6,%ymm1,%ymm1 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - vpshufb %ymm6,%ymm2,%ymm2 - vmovdqu -64(%r11),%ymm11 - vpshufb %ymm6,%ymm3,%ymm3 - - vpaddd %ymm11,%ymm0,%ymm4 - vpaddd %ymm11,%ymm1,%ymm5 - vmovdqu %ymm4,0(%rsp) - vpaddd %ymm11,%ymm2,%ymm6 - vmovdqu %ymm5,32(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - vmovdqu %ymm6,64(%rsp) - vmovdqu %ymm7,96(%rsp) - vpalignr $8,%ymm0,%ymm1,%ymm4 - vpsrldq $4,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $31,%ymm4,%ymm8 - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - vpxor %ymm10,%ymm4,%ymm4 - vpaddd %ymm11,%ymm4,%ymm9 - vmovdqu %ymm9,128(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm5 - vpsrldq $4,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm5,%ymm5 - vpaddd %ymm11,%ymm5,%ymm9 - vmovdqu %ymm9,160(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm6 - vpsrldq $4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $31,%ymm6,%ymm8 - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - vpxor %ymm10,%ymm6,%ymm6 - vpaddd %ymm11,%ymm6,%ymm9 - vmovdqu %ymm9,192(%rsp) - vpalignr 
$8,%ymm3,%ymm4,%ymm7 - vpsrldq $4,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm7,%ymm8 - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - vpxor %ymm10,%ymm7,%ymm7 - vpaddd %ymm11,%ymm7,%ymm9 - vmovdqu %ymm9,224(%rsp) - leaq 128(%rsp),%r13 - jmp .Loop_avx2 -.align 32 -.Loop_avx2: - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - jmp .Lalign32_1 -.align 32 -.Lalign32_1: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - vpxor %ymm1,%ymm0,%ymm0 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpxor %ymm8,%ymm0,%ymm0 - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vpor %ymm8,%ymm0,%ymm0 - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - vpaddd %ymm11,%ymm0,%ymm9 - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - vmovdqu %ymm9,256(%rsp) - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - vpxor %ymm2,%ymm1,%ymm1 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpxor %ymm8,%ymm1,%ymm1 - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vpor %ymm8,%ymm1,%ymm1 - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - vpaddd %ymm11,%ymm1,%ymm9 - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vmovdqu %ymm9,288(%rsp) - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - vpxor %ymm3,%ymm2,%ymm2 - vmovdqu 0(%r11),%ymm11 - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpxor %ymm8,%ymm2,%ymm2 - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vpor %ymm8,%ymm2,%ymm2 - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - vpaddd %ymm11,%ymm2,%ymm9 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vmovdqu %ymm9,320(%rsp) - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - 
andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - vpxor %ymm4,%ymm3,%ymm3 - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpxor %ymm8,%ymm3,%ymm3 - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - vpor %ymm8,%ymm3,%ymm3 - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - vpaddd %ymm11,%ymm3,%ymm9 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vmovdqu %ymm9,352(%rsp) - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpalignr $8,%ymm2,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpxor %ymm5,%ymm4,%ymm4 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpxor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - vpsrld $30,%ymm4,%ymm8 - vpslld $2,%ymm4,%ymm4 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpor %ymm8,%ymm4,%ymm4 - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpaddd %ymm11,%ymm4,%ymm9 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - vmovdqu %ymm9,384(%rsp) - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpalignr $8,%ymm3,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm6,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpxor %ymm8,%ymm5,%ymm5 - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - vpsrld $30,%ymm5,%ymm8 - vpslld $2,%ymm5,%ymm5 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vpor %ymm8,%ymm5,%ymm5 - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - vmovdqu %ymm9,416(%rsp) - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm7,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - vpxor %ymm8,%ymm6,%ymm6 - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - vpsrld $30,%ymm6,%ymm8 - vpslld $2,%ymm6,%ymm6 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpor %ymm8,%ymm6,%ymm6 - addl 
-128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - vmovdqu %ymm9,448(%rsp) - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm5,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm0,%ymm7,%ymm7 - vmovdqu 32(%r11),%ymm11 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpxor %ymm8,%ymm7,%ymm7 - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - vpsrld $30,%ymm7,%ymm8 - vpslld $2,%ymm7,%ymm7 - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpor %ymm8,%ymm7,%ymm7 - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - vmovdqu %ymm9,480(%rsp) - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - jmp .Lalign32_2 -.align 32 -.Lalign32_2: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -64(%r13),%ebp - xorl %esi,%ecx - vpxor %ymm1,%ymm0,%ymm0 - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - vpxor %ymm8,%ymm0,%ymm0 - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - vpor %ymm8,%ymm0,%ymm0 - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpaddd %ymm11,%ymm0,%ymm9 - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - vmovdqu %ymm9,512(%rsp) - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -28(%r13),%ebx - xorl %eax,%edx - vpxor %ymm2,%ymm1,%ymm1 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpxor %ymm8,%ymm1,%ymm1 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - vpor %ymm8,%ymm1,%ymm1 - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpaddd %ymm11,%ymm1,%ymm9 - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - vmovdqu %ymm9,544(%rsp) - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl 
%r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl 8(%r13),%ecx - xorl %ebp,%esi - vpxor %ymm3,%ymm2,%ymm2 - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - vpxor %ymm8,%ymm2,%ymm2 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - vpor %ymm8,%ymm2,%ymm2 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpaddd %ymm11,%ymm2,%ymm9 - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - vmovdqu %ymm9,576(%rsp) - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl 44(%r13),%edx - xorl %ebx,%eax - vpxor %ymm4,%ymm3,%ymm3 - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm3,%ymm3 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl %r12d,%edx - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - vpor %ymm8,%ymm3,%ymm3 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpaddd %ymm11,%ymm3,%ymm9 - addl %r12d,%ecx - andl %edi,%edx - addl 68(%r13),%ebx - xorl %eax,%edx - vmovdqu %ymm9,608(%rsp) - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 
-124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -96(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -60(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%r9),%r13 - leaq 128(%r9),%rdi - cmpq %r10,%r13 - cmovaeq %r9,%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - je .Ldone_avx2 - vmovdqu 64(%r11),%ymm6 - cmpq %r10,%rdi - ja .Last_avx2 - - vmovdqu -64(%rdi),%xmm0 - vmovdqu -48(%rdi),%xmm1 - vmovdqu -32(%rdi),%xmm2 - vmovdqu -16(%rdi),%xmm3 - vinserti128 $1,0(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - jmp .Last_avx2 - -.align 32 -.Last_avx2: - leaq 128+16(%rsp),%r13 - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - subq $-128,%r9 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl 
$27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vmovdqu -64(%r11),%ymm11 - vpshufb %ymm6,%ymm0,%ymm0 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - leal (%rcx,%rsi,1),%ecx 
- rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpshufb %ymm6,%ymm1,%ymm1 - vpaddd %ymm11,%ymm0,%ymm8 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vmovdqu %ymm8,0(%rsp) - vpshufb %ymm6,%ymm2,%ymm2 - vpaddd %ymm11,%ymm1,%ymm9 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - vmovdqu %ymm9,32(%rsp) - vpshufb %ymm6,%ymm3,%ymm3 - vpaddd %ymm11,%ymm2,%ymm6 - addl -64(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - jmp .Lalign32_3 -.align 32 -.Lalign32_3: - vmovdqu %ymm6,64(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - addl -28(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl 
%edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vmovdqu %ymm7,96(%rsp) - addl 8(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm0,%ymm1,%ymm4 - addl 44(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - vpsrldq $4,%ymm3,%ymm8 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - xorl %ebp,%esi - addl %r12d,%edx - vpxor %ymm8,%ymm4,%ymm4 - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - vpsrld $31,%ymm4,%ymm8 - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - andl %edi,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - addl 68(%r13),%ebx - xorl %eax,%edx - vpxor %ymm10,%ymm4,%ymm4 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpaddd %ymm11,%ymm4,%ymm9 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vmovdqu %ymm9,128(%rsp) - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm1,%ymm2,%ymm5 - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrldq $4,%ymm4,%ymm8 - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 - xorl %ebx,%esi - addl 104(%r13),%ecx 
- leal (%rcx,%rsi,1),%ecx - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - xorl %eax,%edx - addl %r12d,%ecx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - vpxor %ymm10,%ymm5,%ymm5 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vmovdqu %ymm9,160(%rsp) - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm2,%ymm3,%ymm6 - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpsrldq $4,%ymm5,%ymm8 - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm8,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - vpsrld $31,%ymm6,%ymm8 - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - xorl %ebp,%esi - addl %r12d,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - xorl %ebx,%esi - addl -96(%r13),%ecx - vpxor %ymm10,%ymm6,%ymm6 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vmovdqu %ymm9,192(%rsp) - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpalignr $8,%ymm3,%ymm4,%ymm7 - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpsrldq $4,%ymm6,%ymm8 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm8,%ymm7,%ymm7 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - vpsrld $31,%ymm7,%ymm8 - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - xorl %ebx,%eax - addl %r12d,%esi - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - xorl %ecx,%eax - addl -60(%r13),%edx - vpxor %ymm10,%ymm7,%ymm7 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vmovdqu %ymm9,224(%rsp) - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl 
$27,%esi,%r12d - addl %r12d,%edx - leaq 128(%rsp),%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - jbe .Loop_avx2 - -.Ldone_avx2: - vzeroupper - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s index 0c06094084e48a..7655283b9885b9 100644 --- a/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/sha/sha256-mb-x86_64.s @@ -9,8 +9,6 @@ sha256_multi_block: movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut - testl $268435456,%ecx - jnz _avx_shortcut movq %rsp,%rax pushq %rbx pushq %rbp @@ -2679,10 +2677,10 @@ _shaext_shortcut: punpckhqdq %xmm8,%xmm14 punpckhqdq %xmm10,%xmm15 - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 jmp .Loop_shaext .align 32 @@ -2714,11 +2712,11 @@ _shaext_shortcut: movdqa %xmm2,%xmm0 movdqa %xmm15,112(%rsp) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pxor %xmm12,%xmm4 movdqa %xmm12,64(%rsp) .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pxor %xmm14,%xmm8 movdqa %xmm14,96(%rsp) movdqa 16-128(%rbp),%xmm1 @@ -2736,11 +2734,11 @@ _shaext_shortcut: .byte 102,68,15,56,0,211 prefetcht0 127(%r9) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,68,15,56,0,219 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 32-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2753,14 +2751,14 @@ _shaext_shortcut: movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,15,58,15,222,4 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 48-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2777,13 +2775,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 64-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2799,13 +2797,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 80-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2821,13 +2819,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 
69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 96-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2843,13 +2841,13 @@ _shaext_shortcut: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 112-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2865,13 +2863,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 128-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2887,13 +2885,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 144-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2909,13 +2907,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 160-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2931,13 +2929,13 @@ _shaext_shortcut: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 176-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2953,13 +2951,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 192-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2975,13 +2973,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 208-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2997,13 +2995,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 224-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -3020,13 +3018,13 @@ _shaext_shortcut: pxor %xmm6,%xmm6 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd 
$14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 movdqa 240-128(%rbp),%xmm1 paddd %xmm7,%xmm1 movq (%rbx),%xmm7 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 240-128(%rbp),%xmm2 paddd %xmm11,%xmm2 .byte 69,15,56,203,247 @@ -3036,17 +3034,17 @@ _shaext_shortcut: cmovgeq %rsp,%r8 cmpl 4(%rbx),%ecx cmovgeq %rsp,%r9 - pshufd $0,%xmm7,%xmm9 + pshufd $0x00,%xmm7,%xmm9 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 - pshufd $85,%xmm7,%xmm10 + pshufd $0x55,%xmm7,%xmm10 movdqa %xmm7,%xmm11 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pcmpgtd %xmm6,%xmm9 pcmpgtd %xmm6,%xmm10 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pcmpgtd %xmm6,%xmm11 movdqa K256_shaext-16(%rip),%xmm3 .byte 69,15,56,203,247 @@ -3068,10 +3066,10 @@ _shaext_shortcut: movl 280(%rsp),%edx - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 movdqa %xmm12,%xmm5 movdqa %xmm13,%xmm6 @@ -3107,4648 +3105,6 @@ _shaext_shortcut: .Lepilogue_shaext: .byte 0xf3,0xc3 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext -.type sha256_multi_block_avx,@function -.align 32 -sha256_multi_block_avx: -_avx_shortcut: - shrq $32,%rcx - cmpl $2,%edx - jb .Lavx - testl $32,%ecx - jnz _avx2_shortcut - jmp .Lavx -.align 32 -.Lavx: - movq %rsp,%rax - pushq %rbx - pushq %rbp - subq $288,%rsp - andq $-256,%rsp - movq %rax,272(%rsp) -.Lbody_avx: - leaq K256+128(%rip),%rbp - leaq 256(%rsp),%rbx - leaq 128(%rdi),%rdi - -.Loop_grande_avx: - movl %edx,280(%rsp) - xorl %edx,%edx - movq 0(%rsi),%r8 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r8 - movq 16(%rsi),%r9 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r9 - movq 32(%rsi),%r10 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r10 - movq 48(%rsi),%r11 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r11 - testl %edx,%edx - jz .Ldone_avx - - vmovdqu 0-128(%rdi),%xmm8 - leaq 128(%rsp),%rax - vmovdqu 32-128(%rdi),%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - vmovdqu 96-128(%rdi),%xmm11 - vmovdqu 128-128(%rdi),%xmm12 - vmovdqu 160-128(%rdi),%xmm13 - vmovdqu 192-128(%rdi),%xmm14 - vmovdqu 224-128(%rdi),%xmm15 - vmovdqu .Lpbswap(%rip),%xmm6 - jmp .Loop_avx - -.align 32 -.Loop_avx: - vpxor %xmm9,%xmm10,%xmm4 - vmovd 0(%r8),%xmm5 - vmovd 0(%r9),%xmm0 - vpinsrd $1,0(%r10),%xmm5,%xmm5 - vpinsrd $1,0(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,0-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor 
%xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovd 4(%r8),%xmm5 - vmovd 4(%r9),%xmm0 - vpinsrd $1,4(%r10),%xmm5,%xmm5 - vpinsrd $1,4(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm5,16-128(%rax) - vpaddd %xmm14,%xmm5,%xmm5 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm5,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovd 8(%r8),%xmm5 - vmovd 8(%r9),%xmm0 - vpinsrd $1,8(%r10),%xmm5,%xmm5 - vpinsrd $1,8(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,32-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovd 12(%r8),%xmm5 - vmovd 12(%r9),%xmm0 - vpinsrd $1,12(%r10),%xmm5,%xmm5 - vpinsrd $1,12(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm5,48-128(%rax) - vpaddd %xmm12,%xmm5,%xmm5 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - 
vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm5,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovd 16(%r8),%xmm5 - vmovd 16(%r9),%xmm0 - vpinsrd $1,16(%r10),%xmm5,%xmm5 - vpinsrd $1,16(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,64-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovd 20(%r8),%xmm5 - vmovd 20(%r9),%xmm0 - vpinsrd $1,20(%r10),%xmm5,%xmm5 - vpinsrd $1,20(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm5,80-128(%rax) - vpaddd %xmm10,%xmm5,%xmm5 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm5,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovd 24(%r8),%xmm5 - vmovd 24(%r9),%xmm0 - vpinsrd $1,24(%r10),%xmm5,%xmm5 - vpinsrd $1,24(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,96-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 
- - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovd 28(%r8),%xmm5 - vmovd 28(%r9),%xmm0 - vpinsrd $1,28(%r10),%xmm5,%xmm5 - vpinsrd $1,28(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm5,112-128(%rax) - vpaddd %xmm8,%xmm5,%xmm5 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm5,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovd 32(%r8),%xmm5 - vmovd 32(%r9),%xmm0 - vpinsrd $1,32(%r10),%xmm5,%xmm5 - vpinsrd $1,32(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,128-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovd 36(%r8),%xmm5 - vmovd 36(%r9),%xmm0 - vpinsrd $1,36(%r10),%xmm5,%xmm5 - vpinsrd $1,36(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm5,144-128(%rax) - vpaddd %xmm14,%xmm5,%xmm5 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm5,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd 
%xmm5,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovd 40(%r8),%xmm5 - vmovd 40(%r9),%xmm0 - vpinsrd $1,40(%r10),%xmm5,%xmm5 - vpinsrd $1,40(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,160-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovd 44(%r8),%xmm5 - vmovd 44(%r9),%xmm0 - vpinsrd $1,44(%r10),%xmm5,%xmm5 - vpinsrd $1,44(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm5,176-128(%rax) - vpaddd %xmm12,%xmm5,%xmm5 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm5,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovd 48(%r8),%xmm5 - vmovd 48(%r9),%xmm0 - vpinsrd $1,48(%r10),%xmm5,%xmm5 - vpinsrd $1,48(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,192-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovd 52(%r8),%xmm5 - 
vmovd 52(%r9),%xmm0 - vpinsrd $1,52(%r10),%xmm5,%xmm5 - vpinsrd $1,52(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm5,208-128(%rax) - vpaddd %xmm10,%xmm5,%xmm5 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm5,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovd 56(%r8),%xmm5 - vmovd 56(%r9),%xmm0 - vpinsrd $1,56(%r10),%xmm5,%xmm5 - vpinsrd $1,56(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,224-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovd 60(%r8),%xmm5 - leaq 64(%r8),%r8 - vmovd 60(%r9),%xmm0 - leaq 64(%r9),%r9 - vpinsrd $1,60(%r10),%xmm5,%xmm5 - leaq 64(%r10),%r10 - vpinsrd $1,60(%r11),%xmm0,%xmm0 - leaq 64(%r11),%r11 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm5,240-128(%rax) - vpaddd %xmm8,%xmm5,%xmm5 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - prefetcht0 63(%r8) - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - prefetcht0 63(%r9) - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - prefetcht0 63(%r10) - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - prefetcht0 63(%r11) - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm5,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - 
vpaddd %xmm5,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovdqu 0-128(%rax),%xmm5 - movl $3,%ecx - jmp .Loop_16_xx_avx -.align 32 -.Loop_16_xx_avx: - vmovdqu 16-128(%rax),%xmm6 - vpaddd 144-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 224-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,0-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovdqu 32-128(%rax),%xmm5 - vpaddd 160-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 240-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm6,16-128(%rax) - vpaddd %xmm14,%xmm6,%xmm6 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm6,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovdqu 48-128(%rax),%xmm6 - vpaddd 176-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - 
vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 0-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,32-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovdqu 64-128(%rax),%xmm5 - vpaddd 192-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 16-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm6,48-128(%rax) - vpaddd %xmm12,%xmm6,%xmm6 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm6,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovdqu 80-128(%rax),%xmm6 - vpaddd 208-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 32-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - 
vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,64-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovdqu 96-128(%rax),%xmm5 - vpaddd 224-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 48-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm6,80-128(%rax) - vpaddd %xmm10,%xmm6,%xmm6 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm6,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovdqu 112-128(%rax),%xmm6 - vpaddd 240-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 64-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,96-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld 
$7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovdqu 128-128(%rax),%xmm5 - vpaddd 0-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 80-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm6,112-128(%rax) - vpaddd %xmm8,%xmm6,%xmm6 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovdqu 144-128(%rax),%xmm6 - vpaddd 16-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 96-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,128-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd 
%xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovdqu 160-128(%rax),%xmm5 - vpaddd 32-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 112-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm6,144-128(%rax) - vpaddd %xmm14,%xmm6,%xmm6 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm6,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovdqu 176-128(%rax),%xmm6 - vpaddd 48-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 128-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,160-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovdqu 
192-128(%rax),%xmm5 - vpaddd 64-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 144-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm6,176-128(%rax) - vpaddd %xmm12,%xmm6,%xmm6 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm6,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovdqu 208-128(%rax),%xmm6 - vpaddd 80-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 160-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,192-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovdqu 224-128(%rax),%xmm5 - vpaddd 96-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 176-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - 
vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm6,208-128(%rax) - vpaddd %xmm10,%xmm6,%xmm6 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm6,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovdqu 240-128(%rax),%xmm6 - vpaddd 112-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 192-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,224-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovdqu 0-128(%rax),%xmm5 - vpaddd 128-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 208-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm6,240-128(%rax) - vpaddd %xmm8,%xmm6,%xmm6 - - vpsrld 
$11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - decl %ecx - jnz .Loop_16_xx_avx - - movl $1,%ecx - leaq K256+128(%rip),%rbp - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqa (%rbx),%xmm7 - vpxor %xmm0,%xmm0,%xmm0 - vmovdqa %xmm7,%xmm6 - vpcmpgtd %xmm0,%xmm6,%xmm6 - vpaddd %xmm6,%xmm7,%xmm7 - - vmovdqu 0-128(%rdi),%xmm0 - vpand %xmm6,%xmm8,%xmm8 - vmovdqu 32-128(%rdi),%xmm1 - vpand %xmm6,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm2 - vpand %xmm6,%xmm10,%xmm10 - vmovdqu 96-128(%rdi),%xmm5 - vpand %xmm6,%xmm11,%xmm11 - vpaddd %xmm0,%xmm8,%xmm8 - vmovdqu 128-128(%rdi),%xmm0 - vpand %xmm6,%xmm12,%xmm12 - vpaddd %xmm1,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm1 - vpand %xmm6,%xmm13,%xmm13 - vpaddd %xmm2,%xmm10,%xmm10 - vmovdqu 192-128(%rdi),%xmm2 - vpand %xmm6,%xmm14,%xmm14 - vpaddd %xmm5,%xmm11,%xmm11 - vmovdqu 224-128(%rdi),%xmm5 - vpand %xmm6,%xmm15,%xmm15 - vpaddd %xmm0,%xmm12,%xmm12 - vpaddd %xmm1,%xmm13,%xmm13 - vmovdqu %xmm8,0-128(%rdi) - vpaddd %xmm2,%xmm14,%xmm14 - vmovdqu %xmm9,32-128(%rdi) - vpaddd %xmm5,%xmm15,%xmm15 - vmovdqu %xmm10,64-128(%rdi) - vmovdqu %xmm11,96-128(%rdi) - vmovdqu %xmm12,128-128(%rdi) - vmovdqu %xmm13,160-128(%rdi) - vmovdqu %xmm14,192-128(%rdi) - vmovdqu %xmm15,224-128(%rdi) - - vmovdqu %xmm7,(%rbx) - vmovdqu .Lpbswap(%rip),%xmm6 - decl %edx - jnz .Loop_avx - - movl 280(%rsp),%edx - leaq 16(%rdi),%rdi - leaq 64(%rsi),%rsi - decl %edx - jnz .Loop_grande_avx - -.Ldone_avx: - movq 272(%rsp),%rax - vzeroupper - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.size sha256_multi_block_avx,.-sha256_multi_block_avx -.type sha256_multi_block_avx2,@function -.align 32 -sha256_multi_block_avx2: -_avx2_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $576,%rsp - andq $-256,%rsp - movq %rax,544(%rsp) -.Lbody_avx2: - leaq K256+128(%rip),%rbp - leaq 128(%rdi),%rdi - -.Loop_grande_avx2: - movl %edx,552(%rsp) - xorl %edx,%edx - leaq 512(%rsp),%rbx - movq 0(%rsi),%r12 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r12 - movq 16(%rsi),%r13 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r13 - movq 32(%rsi),%r14 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r14 - movq 48(%rsi),%r15 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r15 - movq 64(%rsi),%r8 - movl 72(%rsi),%ecx - cmpl %edx,%ecx - 
cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,16(%rbx) - cmovleq %rbp,%r8 - movq 80(%rsi),%r9 - movl 88(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,20(%rbx) - cmovleq %rbp,%r9 - movq 96(%rsi),%r10 - movl 104(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,24(%rbx) - cmovleq %rbp,%r10 - movq 112(%rsi),%r11 - movl 120(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,28(%rbx) - cmovleq %rbp,%r11 - vmovdqu 0-128(%rdi),%ymm8 - leaq 128(%rsp),%rax - vmovdqu 32-128(%rdi),%ymm9 - leaq 256+128(%rsp),%rbx - vmovdqu 64-128(%rdi),%ymm10 - vmovdqu 96-128(%rdi),%ymm11 - vmovdqu 128-128(%rdi),%ymm12 - vmovdqu 160-128(%rdi),%ymm13 - vmovdqu 192-128(%rdi),%ymm14 - vmovdqu 224-128(%rdi),%ymm15 - vmovdqu .Lpbswap(%rip),%ymm6 - jmp .Loop_avx2 - -.align 32 -.Loop_avx2: - vpxor %ymm9,%ymm10,%ymm4 - vmovd 0(%r12),%xmm5 - vmovd 0(%r8),%xmm0 - vmovd 0(%r13),%xmm1 - vmovd 0(%r9),%xmm2 - vpinsrd $1,0(%r14),%xmm5,%xmm5 - vpinsrd $1,0(%r10),%xmm0,%xmm0 - vpinsrd $1,0(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,0(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,0-128(%rax) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovd 4(%r12),%xmm5 - vmovd 4(%r8),%xmm0 - vmovd 4(%r13),%xmm1 - vmovd 4(%r9),%xmm2 - vpinsrd $1,4(%r14),%xmm5,%xmm5 - vpinsrd $1,4(%r10),%xmm0,%xmm0 - vpinsrd $1,4(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,4(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm5,32-128(%rax) - vpaddd %ymm14,%ymm5,%ymm5 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm5,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovd 
8(%r12),%xmm5 - vmovd 8(%r8),%xmm0 - vmovd 8(%r13),%xmm1 - vmovd 8(%r9),%xmm2 - vpinsrd $1,8(%r14),%xmm5,%xmm5 - vpinsrd $1,8(%r10),%xmm0,%xmm0 - vpinsrd $1,8(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,8(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,64-128(%rax) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovd 12(%r12),%xmm5 - vmovd 12(%r8),%xmm0 - vmovd 12(%r13),%xmm1 - vmovd 12(%r9),%xmm2 - vpinsrd $1,12(%r14),%xmm5,%xmm5 - vpinsrd $1,12(%r10),%xmm0,%xmm0 - vpinsrd $1,12(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,12(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm5,96-128(%rax) - vpaddd %ymm12,%ymm5,%ymm5 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm5,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovd 16(%r12),%xmm5 - vmovd 16(%r8),%xmm0 - vmovd 16(%r13),%xmm1 - vmovd 16(%r9),%xmm2 - vpinsrd $1,16(%r14),%xmm5,%xmm5 - vpinsrd $1,16(%r10),%xmm0,%xmm0 - vpinsrd $1,16(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,16(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,128-128(%rax) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - 
- vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovd 20(%r12),%xmm5 - vmovd 20(%r8),%xmm0 - vmovd 20(%r13),%xmm1 - vmovd 20(%r9),%xmm2 - vpinsrd $1,20(%r14),%xmm5,%xmm5 - vpinsrd $1,20(%r10),%xmm0,%xmm0 - vpinsrd $1,20(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,20(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm5,160-128(%rax) - vpaddd %ymm10,%ymm5,%ymm5 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm5,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovd 24(%r12),%xmm5 - vmovd 24(%r8),%xmm0 - vmovd 24(%r13),%xmm1 - vmovd 24(%r9),%xmm2 - vpinsrd $1,24(%r14),%xmm5,%xmm5 - vpinsrd $1,24(%r10),%xmm0,%xmm0 - vpinsrd $1,24(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,24(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,192-128(%rax) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovd 28(%r12),%xmm5 - vmovd 28(%r8),%xmm0 - vmovd 28(%r13),%xmm1 - vmovd 28(%r9),%xmm2 - vpinsrd $1,28(%r14),%xmm5,%xmm5 - vpinsrd $1,28(%r10),%xmm0,%xmm0 - vpinsrd $1,28(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,28(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu 
%ymm5,224-128(%rax) - vpaddd %ymm8,%ymm5,%ymm5 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm5,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovd 32(%r12),%xmm5 - vmovd 32(%r8),%xmm0 - vmovd 32(%r13),%xmm1 - vmovd 32(%r9),%xmm2 - vpinsrd $1,32(%r14),%xmm5,%xmm5 - vpinsrd $1,32(%r10),%xmm0,%xmm0 - vpinsrd $1,32(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,32(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,256-256-128(%rbx) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovd 36(%r12),%xmm5 - vmovd 36(%r8),%xmm0 - vmovd 36(%r13),%xmm1 - vmovd 36(%r9),%xmm2 - vpinsrd $1,36(%r14),%xmm5,%xmm5 - vpinsrd $1,36(%r10),%xmm0,%xmm0 - vpinsrd $1,36(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,36(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm5,288-256-128(%rbx) - vpaddd %ymm14,%ymm5,%ymm5 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm5,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor 
%ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovd 40(%r12),%xmm5 - vmovd 40(%r8),%xmm0 - vmovd 40(%r13),%xmm1 - vmovd 40(%r9),%xmm2 - vpinsrd $1,40(%r14),%xmm5,%xmm5 - vpinsrd $1,40(%r10),%xmm0,%xmm0 - vpinsrd $1,40(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,40(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,320-256-128(%rbx) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovd 44(%r12),%xmm5 - vmovd 44(%r8),%xmm0 - vmovd 44(%r13),%xmm1 - vmovd 44(%r9),%xmm2 - vpinsrd $1,44(%r14),%xmm5,%xmm5 - vpinsrd $1,44(%r10),%xmm0,%xmm0 - vpinsrd $1,44(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,44(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm5,352-256-128(%rbx) - vpaddd %ymm12,%ymm5,%ymm5 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm5,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovd 48(%r12),%xmm5 - vmovd 48(%r8),%xmm0 - vmovd 48(%r13),%xmm1 - vmovd 48(%r9),%xmm2 - vpinsrd $1,48(%r14),%xmm5,%xmm5 - vpinsrd $1,48(%r10),%xmm0,%xmm0 - vpinsrd $1,48(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,48(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,384-256-128(%rbx) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - 
vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovd 52(%r12),%xmm5 - vmovd 52(%r8),%xmm0 - vmovd 52(%r13),%xmm1 - vmovd 52(%r9),%xmm2 - vpinsrd $1,52(%r14),%xmm5,%xmm5 - vpinsrd $1,52(%r10),%xmm0,%xmm0 - vpinsrd $1,52(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,52(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm5,416-256-128(%rbx) - vpaddd %ymm10,%ymm5,%ymm5 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm5,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovd 56(%r12),%xmm5 - vmovd 56(%r8),%xmm0 - vmovd 56(%r13),%xmm1 - vmovd 56(%r9),%xmm2 - vpinsrd $1,56(%r14),%xmm5,%xmm5 - vpinsrd $1,56(%r10),%xmm0,%xmm0 - vpinsrd $1,56(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,56(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,448-256-128(%rbx) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovd 60(%r12),%xmm5 - leaq 64(%r12),%r12 - vmovd 60(%r8),%xmm0 - leaq 64(%r8),%r8 - vmovd 60(%r13),%xmm1 - leaq 64(%r13),%r13 - vmovd 60(%r9),%xmm2 - leaq 64(%r9),%r9 - vpinsrd $1,60(%r14),%xmm5,%xmm5 - leaq 64(%r14),%r14 - vpinsrd $1,60(%r10),%xmm0,%xmm0 - leaq 64(%r10),%r10 - vpinsrd $1,60(%r15),%xmm1,%xmm1 
- leaq 64(%r15),%r15 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,60(%r11),%xmm2,%xmm2 - leaq 64(%r11),%r11 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm5,480-256-128(%rbx) - vpaddd %ymm8,%ymm5,%ymm5 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r12) - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - prefetcht0 63(%r13) - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r14) - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - prefetcht0 63(%r15) - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm9,%ymm1 - prefetcht0 63(%r8) - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - prefetcht0 63(%r9) - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r10) - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm5,%ymm12,%ymm12 - prefetcht0 63(%r11) - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovdqu 0-128(%rax),%ymm5 - movl $3,%ecx - jmp .Loop_16_xx_avx2 -.align 32 -.Loop_16_xx_avx2: - vmovdqu 32-128(%rax),%ymm6 - vpaddd 288-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 448-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,0-128(%rax) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovdqu 64-128(%rax),%ymm5 - vpaddd 320-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 480-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor 
%ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm6,32-128(%rax) - vpaddd %ymm14,%ymm6,%ymm6 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm6,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovdqu 96-128(%rax),%ymm6 - vpaddd 352-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 0-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,64-128(%rax) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovdqu 128-128(%rax),%ymm5 - vpaddd 384-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 32-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm6,96-128(%rax) - vpaddd %ymm12,%ymm6,%ymm6 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm6,%ymm6 - vpxor 
%ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm6,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovdqu 160-128(%rax),%ymm6 - vpaddd 416-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 64-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,128-128(%rax) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovdqu 192-128(%rax),%ymm5 - vpaddd 448-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 96-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm6,160-128(%rax) - vpaddd %ymm10,%ymm6,%ymm6 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd 
%ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm6,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovdqu 224-128(%rax),%ymm6 - vpaddd 480-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 128-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,192-128(%rax) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovdqu 256-256-128(%rbx),%ymm5 - vpaddd 0-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 160-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm6,224-128(%rax) - vpaddd %ymm8,%ymm6,%ymm6 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm6,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd 
%ymm6,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovdqu 288-256-128(%rbx),%ymm6 - vpaddd 32-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 192-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,256-256-128(%rbx) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovdqu 320-256-128(%rbx),%ymm5 - vpaddd 64-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 224-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm6,288-256-128(%rbx) - vpaddd %ymm14,%ymm6,%ymm6 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm6,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovdqu 352-256-128(%rbx),%ymm6 - vpaddd 96-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 256-256-128(%rbx),%ymm0 
- vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,320-256-128(%rbx) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovdqu 384-256-128(%rbx),%ymm5 - vpaddd 128-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 288-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm6,352-256-128(%rbx) - vpaddd %ymm12,%ymm6,%ymm6 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm6,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovdqu 416-256-128(%rbx),%ymm6 - vpaddd 160-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 320-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd 
%ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,384-256-128(%rbx) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovdqu 448-256-128(%rbx),%ymm5 - vpaddd 192-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 352-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm6,416-256-128(%rbx) - vpaddd %ymm10,%ymm6,%ymm6 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm6,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovdqu 480-256-128(%rbx),%ymm6 - vpaddd 224-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 384-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,448-256-128(%rbx) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld 
$7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovdqu 0-128(%rax),%ymm5 - vpaddd 256-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 416-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm6,480-256-128(%rbx) - vpaddd %ymm8,%ymm6,%ymm6 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm6,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - decl %ecx - jnz .Loop_16_xx_avx2 - - movl $1,%ecx - leaq 512(%rsp),%rbx - leaq K256+128(%rip),%rbp - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r12 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r13 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r14 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r15 - cmpl 16(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 20(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 24(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 28(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqa (%rbx),%ymm7 - vpxor %ymm0,%ymm0,%ymm0 - vmovdqa %ymm7,%ymm6 - vpcmpgtd %ymm0,%ymm6,%ymm6 - vpaddd %ymm6,%ymm7,%ymm7 - - vmovdqu 0-128(%rdi),%ymm0 - vpand %ymm6,%ymm8,%ymm8 - vmovdqu 32-128(%rdi),%ymm1 - vpand %ymm6,%ymm9,%ymm9 - vmovdqu 64-128(%rdi),%ymm2 - vpand %ymm6,%ymm10,%ymm10 - vmovdqu 96-128(%rdi),%ymm5 - vpand %ymm6,%ymm11,%ymm11 - vpaddd %ymm0,%ymm8,%ymm8 - vmovdqu 128-128(%rdi),%ymm0 - vpand %ymm6,%ymm12,%ymm12 - vpaddd %ymm1,%ymm9,%ymm9 - vmovdqu 160-128(%rdi),%ymm1 - vpand %ymm6,%ymm13,%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vmovdqu 192-128(%rdi),%ymm2 - vpand %ymm6,%ymm14,%ymm14 - vpaddd %ymm5,%ymm11,%ymm11 - vmovdqu 224-128(%rdi),%ymm5 - vpand %ymm6,%ymm15,%ymm15 - vpaddd %ymm0,%ymm12,%ymm12 - vpaddd %ymm1,%ymm13,%ymm13 - vmovdqu %ymm8,0-128(%rdi) - vpaddd %ymm2,%ymm14,%ymm14 - vmovdqu %ymm9,32-128(%rdi) - 
vpaddd %ymm5,%ymm15,%ymm15 - vmovdqu %ymm10,64-128(%rdi) - vmovdqu %ymm11,96-128(%rdi) - vmovdqu %ymm12,128-128(%rdi) - vmovdqu %ymm13,160-128(%rdi) - vmovdqu %ymm14,192-128(%rdi) - vmovdqu %ymm15,224-128(%rdi) - - vmovdqu %ymm7,(%rbx) - leaq 256+128(%rsp),%rbx - vmovdqu .Lpbswap(%rip),%ymm6 - decl %edx - jnz .Loop_avx2 - - - - - - - -.Ldone_avx2: - movq 544(%rsp),%rax - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 .align 256 K256: .long 1116352408,1116352408,1116352408,1116352408 diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s index a2fbedaf8c6b9e..ab16a7b618820c 100644 --- a/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/sha/sha256-x86_64.s @@ -11,14 +11,6 @@ sha256_block_data_order: movl 8(%r11),%r11d testl $536870912,%r11d jnz _shaext_shortcut - andl $296,%r11d - cmpl $296,%r11d - je .Lavx2_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut pushq %rbx @@ -1762,9 +1754,9 @@ _shaext_shortcut: movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 @@ -1783,7 +1775,7 @@ _shaext_shortcut: .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 @@ -1792,7 +1784,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 @@ -1801,7 +1793,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1813,7 +1805,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1824,7 +1816,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1835,7 +1827,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1846,7 +1838,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1857,7 +1849,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1868,7 +1860,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1879,7 +1871,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa 
%xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1890,7 +1882,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1901,7 +1893,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1912,7 +1904,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1923,7 +1915,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 @@ -1932,7 +1924,7 @@ _shaext_shortcut: movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 @@ -1941,7 +1933,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 nop .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 @@ -1950,9 +1942,9 @@ _shaext_shortcut: paddd %xmm9,%xmm1 jnz .Loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 @@ -3055,2304 +3047,3 @@ sha256_block_data_order_ssse3: .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 -.type sha256_block_data_order_avx,@function -.align 64 -sha256_block_data_order_avx: -.Lavx_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - shlq $4,%rdx - subq $96,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) -.Lprologue_avx: - - vzeroupper - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%xmm8 - vmovdqa K256+512+64(%rip),%xmm9 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%edi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%edi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - subq $-128,%rbp - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl 
%ebx,%edi - vpshufd $250,%xmm3,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm0,%xmm0 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpshufd $80,%xmm0,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm0,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl 
%r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm1,%xmm1 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpshufd $80,%xmm1,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm1,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm2,%xmm2 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpshufd $80,%xmm2,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - 
addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm1,%xmm2,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm2,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm3,%xmm3 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpshufd $80,%xmm3,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl 
%eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne .Lavx_00_47 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - 
xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - 
shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_avx - - movq 64+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.size sha256_block_data_order_avx,.-sha256_block_data_order_avx -.type sha256_block_data_order_avx2,@function -.align 64 -sha256_block_data_order_avx2: -.Lavx2_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $544,%rsp - shlq $4,%rdx - andq $-1024,%rsp - leaq (%rsi,%rdx,4),%rdx - addq $448,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) -.Lprologue_avx2: - - vzeroupper - subq $-64,%rsi - movl 0(%rdi),%eax - movq %rsi,%r12 - movl 4(%rdi),%ebx - cmpq %rdx,%rsi - movl 8(%rdi),%ecx - cmoveq %rsp,%r12 - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%ymm8 - vmovdqa K256+512+64(%rip),%ymm9 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqa K256+512(%rip),%ymm7 - vmovdqu -64+0(%rsi),%xmm0 - vmovdqu -64+16(%rsi),%xmm1 - vmovdqu -64+32(%rsi),%xmm2 - vmovdqu -64+48(%rsi),%xmm3 - - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm7,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm7,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - - leaq K256(%rip),%rbp - vpshufb %ymm7,%ymm2,%ymm2 - vpaddd 0(%rbp),%ymm0,%ymm4 - vpshufb %ymm7,%ymm3,%ymm3 - vpaddd 32(%rbp),%ymm1,%ymm5 - vpaddd 64(%rbp),%ymm2,%ymm6 - vpaddd 96(%rbp),%ymm3,%ymm7 - vmovdqa %ymm4,0(%rsp) - xorl %r14d,%r14d - vmovdqa %ymm5,32(%rsp) - leaq -64(%rsp),%rsp - movl %ebx,%edi - vmovdqa %ymm6,0(%rsp) - xorl %ecx,%edi - vmovdqa %ymm7,32(%rsp) - movl %r9d,%r12d - subq $-32*4,%rbp - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: - leaq -64(%rsp),%rsp - vpalignr $4,%ymm0,%ymm1,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr 
$4,%ymm2,%ymm3,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm0,%ymm0 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - vpshufd $250,%ymm3,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm0,%ymm0 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpshufd $80,%ymm0,%ymm7 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpxor %ymm7,%ymm6,%ymm6 - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - vpaddd 0(%rbp),%ymm0,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm1,%ymm2,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm3,%ymm0,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm1,%ymm1 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - vpshufd $250,%ymm0,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - vpsrld 
$11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm1,%ymm1 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpshufd $80,%ymm1,%ymm7 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpxor %ymm7,%ymm6,%ymm6 - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - vpaddd 32(%rbp),%ymm1,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq -64(%rsp),%rsp - vpalignr $4,%ymm2,%ymm3,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm0,%ymm1,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm2,%ymm2 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - vpshufd $250,%ymm1,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl 
%r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm2,%ymm2 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpshufd $80,%ymm2,%ymm7 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpxor %ymm7,%ymm6,%ymm6 - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - vpaddd 64(%rbp),%ymm2,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm3,%ymm0,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm1,%ymm2,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm3,%ymm3 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - vpshufd $250,%ymm2,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm3,%ymm3 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpshufd $80,%ymm3,%ymm7 - rorxl $13,%ecx,%r14d - 
rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpxor %ymm7,%ymm6,%ymm6 - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - vpaddd 96(%rbp),%ymm3,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq 128(%rbp),%rbp - cmpb $0,3(%rbp) - jne .Lavx2_00_47 - addl 0+64(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4+64(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+64(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12+64(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+64(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d 
- leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36+64(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+64(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44+64(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - addl 0(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl 
$2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - movq 512(%rsp),%rdi - addl %r14d,%eax - - leaq 448(%rsp),%rbp - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - - cmpq 80(%rbp),%rsi - je .Ldone_avx2 - - xorl %r14d,%r14d - movl %ebx,%edi - xorl %ecx,%edi - movl %r9d,%r12d - jmp .Lower_avx2 -.align 16 -.Lower_avx2: - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal 
(%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - leaq -64(%rbp),%rbp - cmpq %rsp,%rbp - jae .Lower_avx2 - - movq 512(%rsp),%rdi - addl %r14d,%eax 
- - leaq 448(%rsp),%rsp - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - leaq 128(%rsi),%rsi - addl 24(%rdi),%r10d - movq %rsi,%r12 - addl 28(%rdi),%r11d - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - cmoveq %rsp,%r12 - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - - jbe .Loop_avx2 - leaq (%rsp),%rbp - -.Ldone_avx2: - leaq (%rbp),%rsp - movq 64+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2 diff --git a/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s b/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s index a1021c17a966b8..f6638db30e9fad 100644 --- a/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s +++ b/deps/openssl/asm/x64-elf-gas/sha/sha512-x86_64.s @@ -5,20 +5,6 @@ .type sha512_block_data_order,@function .align 16 sha512_block_data_order: - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - testl $2048,%r10d - jnz .Lxop_shortcut - andl $296,%r11d - cmpl $296,%r11d - je .Lavx2_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -1795,3571 +1781,3 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha512_block_data_order_xop,@function -.align 64 -sha512_block_data_order_xop: -.Lxop_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) -.Lprologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_xop -.align 16 -.Lloop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - addq $256,%rbp - vpalignr 
$8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - 
xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq 
$6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 
- addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lxop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - 
addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 
128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_xop - - movq 128+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_xop: - .byte 0xf3,0xc3 -.size sha512_block_data_order_xop,.-sha512_block_data_order_xop -.type sha512_block_data_order_avx,@function -.align 64 -sha512_block_data_order_avx: -.Lavx_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) -.Lprologue_avx: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm0,%xmm0 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 0(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm7,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm7,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm7,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 8(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 
- xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm0,%xmm0 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm1,%xmm1 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 16(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm0,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm0,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm0,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 24(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm1,%xmm1 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm2,%xmm2 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 32(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm1,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm1,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm1,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 40(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm2,%xmm2 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq 
%r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm3,%xmm3 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 48(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm2,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm2,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm2,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 56(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm3,%xmm3 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm4,%xmm4 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 64(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm3,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm3,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm3,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 72(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm4,%xmm4 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm5,%xmm5 - shrdq $4,%r13,%r13 - 
xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 80(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm4,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm4,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm4,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 88(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm5,%xmm5 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm6,%xmm6 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 96(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm5,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm5,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm5,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 104(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm6,%xmm6 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm7,%xmm7 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 112(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - 
vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm6,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm6,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm6,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 120(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm7,%xmm7 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lavx_00_47 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq 
%rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 
- addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_avx - - movq 128+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.size sha512_block_data_order_avx,.-sha512_block_data_order_avx -.type sha512_block_data_order_avx2,@function -.align 64 -sha512_block_data_order_avx2: -.Lavx2_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $1312,%rsp - shlq $4,%rdx - andq $-2048,%rsp - leaq (%rsi,%rdx,8),%rdx - addq $1152,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) -.Lprologue_avx2: - - vzeroupper - subq $-128,%rsi - movq 0(%rdi),%rax - movq %rsi,%r12 - movq 8(%rdi),%rbx - cmpq %rdx,%rsi - movq 16(%rdi),%rcx - cmoveq %rsp,%r12 - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Loop_avx2 -.align 16 -.Loop_avx2: - vmovdqu -128(%rsi),%xmm0 - vmovdqu -128+16(%rsi),%xmm1 - vmovdqu -128+32(%rsi),%xmm2 - leaq K512+128(%rip),%rbp - vmovdqu -128+48(%rsi),%xmm3 - vmovdqu -128+64(%rsi),%xmm4 - vmovdqu -128+80(%rsi),%xmm5 - vmovdqu -128+96(%rsi),%xmm6 - vmovdqu -128+112(%rsi),%xmm7 - - vmovdqa 1152(%rbp),%ymm10 - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm10,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm10,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - vpshufb %ymm10,%ymm2,%ymm2 - vinserti128 $1,64(%r12),%ymm4,%ymm4 - vpshufb %ymm10,%ymm3,%ymm3 - vinserti128 $1,80(%r12),%ymm5,%ymm5 - vpshufb 
%ymm10,%ymm4,%ymm4 - vinserti128 $1,96(%r12),%ymm6,%ymm6 - vpshufb %ymm10,%ymm5,%ymm5 - vinserti128 $1,112(%r12),%ymm7,%ymm7 - - vpaddq -128(%rbp),%ymm0,%ymm8 - vpshufb %ymm10,%ymm6,%ymm6 - vpaddq -96(%rbp),%ymm1,%ymm9 - vpshufb %ymm10,%ymm7,%ymm7 - vpaddq -64(%rbp),%ymm2,%ymm10 - vpaddq -32(%rbp),%ymm3,%ymm11 - vmovdqa %ymm8,0(%rsp) - vpaddq 0(%rbp),%ymm4,%ymm8 - vmovdqa %ymm9,32(%rsp) - vpaddq 32(%rbp),%ymm5,%ymm9 - vmovdqa %ymm10,64(%rsp) - vpaddq 64(%rbp),%ymm6,%ymm10 - vmovdqa %ymm11,96(%rsp) - leaq -128(%rsp),%rsp - vpaddq 96(%rbp),%ymm7,%ymm11 - vmovdqa %ymm8,0(%rsp) - xorq %r14,%r14 - vmovdqa %ymm9,32(%rsp) - movq %rbx,%rdi - vmovdqa %ymm10,64(%rsp) - xorq %rcx,%rdi - vmovdqa %ymm11,96(%rsp) - movq %r9,%r12 - addq $32*8,%rbp - jmp .Lavx2_00_47 - -.align 16 -.Lavx2_00_47: - leaq -128(%rsp),%rsp - vpalignr $8,%ymm0,%ymm1,%ymm8 - addq 0+256(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - vpalignr $8,%ymm4,%ymm5,%ymm11 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - vpsrlq $1,%ymm8,%ymm10 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - vpaddq %ymm11,%ymm0,%ymm0 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - vpsrlq $6,%ymm7,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - vpsllq $3,%ymm7,%ymm10 - vpaddq %ymm8,%ymm0,%ymm0 - addq 8+256(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - vpsrlq $19,%ymm7,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - vpaddq %ymm11,%ymm0,%ymm0 - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - vpaddq -128(%rbp),%ymm0,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - vmovdqa %ymm10,0(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm8 - addq 32+256(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - vpalignr $8,%ymm5,%ymm6,%ymm11 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - vpsrlq $1,%ymm8,%ymm10 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - vpaddq %ymm11,%ymm1,%ymm1 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - vpsrlq $6,%ymm0,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - vpsllq $3,%ymm0,%ymm10 - vpaddq %ymm8,%ymm1,%ymm1 - addq 40+256(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - vpsrlq $19,%ymm0,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 
- leaq (%r8,%r12,1),%r8 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - vpaddq %ymm11,%ymm1,%ymm1 - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - vpaddq -96(%rbp),%ymm1,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - vmovdqa %ymm10,32(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm8 - addq 64+256(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - vpalignr $8,%ymm6,%ymm7,%ymm11 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - vpsrlq $1,%ymm8,%ymm10 - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - vpaddq %ymm11,%ymm2,%ymm2 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - vpsrlq $6,%ymm1,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - vpsllq $3,%ymm1,%ymm10 - vpaddq %ymm8,%ymm2,%ymm2 - addq 72+256(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - vpsrlq $19,%ymm1,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - vpaddq %ymm11,%ymm2,%ymm2 - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - vpaddq -64(%rbp),%ymm2,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - vmovdqa %ymm10,64(%rsp) - vpalignr $8,%ymm3,%ymm4,%ymm8 - addq 96+256(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - vpalignr $8,%ymm7,%ymm0,%ymm11 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - vpsrlq $1,%ymm8,%ymm10 - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - vpaddq %ymm11,%ymm3,%ymm3 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - vpsrlq $6,%ymm2,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - vpsllq $3,%ymm2,%ymm10 - vpaddq %ymm8,%ymm3,%ymm3 - addq 104+256(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - vpsrlq $19,%ymm2,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - 
leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - vpaddq %ymm11,%ymm3,%ymm3 - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - vpaddq -32(%rbp),%ymm3,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - vmovdqa %ymm10,96(%rsp) - leaq -128(%rsp),%rsp - vpalignr $8,%ymm4,%ymm5,%ymm8 - addq 0+256(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - vpalignr $8,%ymm0,%ymm1,%ymm11 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - vpsrlq $1,%ymm8,%ymm10 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - vpaddq %ymm11,%ymm4,%ymm4 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - vpsrlq $6,%ymm3,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - vpsllq $3,%ymm3,%ymm10 - vpaddq %ymm8,%ymm4,%ymm4 - addq 8+256(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - vpsrlq $19,%ymm3,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - vpaddq %ymm11,%ymm4,%ymm4 - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - vpaddq 0(%rbp),%ymm4,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - vmovdqa %ymm10,0(%rsp) - vpalignr $8,%ymm5,%ymm6,%ymm8 - addq 32+256(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - vpalignr $8,%ymm1,%ymm2,%ymm11 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - vpsrlq $1,%ymm8,%ymm10 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - vpaddq %ymm11,%ymm5,%ymm5 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - vpsrlq $6,%ymm4,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - vpsllq $3,%ymm4,%ymm10 - vpaddq %ymm8,%ymm5,%ymm5 - addq 40+256(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - vpsrlq $19,%ymm4,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - vpaddq %ymm11,%ymm5,%ymm5 - rorxq 
$34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - vpaddq 32(%rbp),%ymm5,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - vmovdqa %ymm10,32(%rsp) - vpalignr $8,%ymm6,%ymm7,%ymm8 - addq 64+256(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - vpalignr $8,%ymm2,%ymm3,%ymm11 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - vpsrlq $1,%ymm8,%ymm10 - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - vpaddq %ymm11,%ymm6,%ymm6 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - vpsrlq $6,%ymm5,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - vpsllq $3,%ymm5,%ymm10 - vpaddq %ymm8,%ymm6,%ymm6 - addq 72+256(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - vpsrlq $19,%ymm5,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - vpaddq %ymm11,%ymm6,%ymm6 - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - vpaddq 64(%rbp),%ymm6,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - vmovdqa %ymm10,64(%rsp) - vpalignr $8,%ymm7,%ymm0,%ymm8 - addq 96+256(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - vpalignr $8,%ymm3,%ymm4,%ymm11 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - vpsrlq $1,%ymm8,%ymm10 - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - vpaddq %ymm11,%ymm7,%ymm7 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - vpsrlq $6,%ymm6,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - vpsllq $3,%ymm6,%ymm10 - vpaddq %ymm8,%ymm7,%ymm7 - addq 104+256(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - vpsrlq $19,%ymm6,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - vpaddq %ymm11,%ymm7,%ymm7 - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - vpaddq 96(%rbp),%ymm7,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq 
%r9,%r12 - vmovdqa %ymm10,96(%rsp) - leaq 256(%rbp),%rbp - cmpb $0,-121(%rbp) - jne .Lavx2_00_47 - addq 0+128(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8+128(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32+128(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40+128(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64+128(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72+128(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96+128(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq 
(%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104+128(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - addq 0(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96(%rsp),%rbx - andq %r10,%r12 - rorxq 
$41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - movq 1280(%rsp),%rdi - addq %r14,%rax - - leaq 1152(%rsp),%rbp - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - - cmpq 144(%rbp),%rsi - je .Ldone_avx2 - - xorq %r14,%r14 - movq %rbx,%rdi - xorq %rcx,%rdi - movq %r9,%r12 - jmp .Lower_avx2 -.align 16 -.Lower_avx2: - addq 0+16(%rbp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8+16(%rbp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32+16(%rbp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40+16(%rbp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64+16(%rbp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq 
$18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72+16(%rbp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96+16(%rbp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104+16(%rbp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - leaq -128(%rbp),%rbp - cmpq %rsp,%rbp - jae .Lower_avx2 - - movq 1280(%rsp),%rdi - addq %r14,%rax - - leaq 1152(%rsp),%rsp - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - leaq 256(%rsi),%rsi - addq 48(%rdi),%r10 - movq %rsi,%r12 - addq 56(%rdi),%r11 - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - cmoveq %rsp,%r12 - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - - jbe .Loop_avx2 - leaq (%rsp),%rbp - -.Ldone_avx2: - leaq (%rbp),%rsp - movq 128+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2 diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s index ccd3c7090025ff..e45c622a525e82 100644 --- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-mb-x86_64.s @@ -6,14 +6,6 @@ .p2align 5 _aesni_multi_cbc_encrypt: - cmpl $2,%edx - jb L$enc_non_avx - movl _OPENSSL_ia32cap_P+4(%rip),%ecx - testl $268435456,%ecx - jnz _avx_cbc_enc_shortcut - jmp L$enc_non_avx -.p2align 4 -L$enc_non_avx: movq %rsp,%rax pushq %rbx pushq %rbp @@ -270,14 +262,6 @@ L$enc4x_epilogue: .p2align 5 _aesni_multi_cbc_decrypt: - cmpl $2,%edx - jb L$dec_non_avx - movl _OPENSSL_ia32cap_P+4(%rip),%ecx - testl $268435456,%ecx - 
jnz _avx_cbc_dec_shortcut - jmp L$dec_non_avx -.p2align 4 -L$dec_non_avx: movq %rsp,%rax pushq %rbx pushq %rbp @@ -519,916 +503,3 @@ L$dec4x_done: leaq (%rax),%rsp L$dec4x_epilogue: .byte 0xf3,0xc3 - - -.p2align 5 -aesni_multi_cbc_encrypt_avx: -_avx_cbc_enc_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - - - - - - - - subq $192,%rsp - andq $-128,%rsp - movq %rax,16(%rsp) - -L$enc8x_body: - vzeroupper - vmovdqu (%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -L$enc8x_loop_grande: - - xorl %edx,%edx - movl -144(%rdi),%ecx - movq -160(%rdi),%r8 - cmpl %edx,%ecx - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq %r8,%rbx - movq %rbx,64(%rsp) - movl -104(%rdi),%ecx - movq -120(%rdi),%r9 - cmpl %edx,%ecx - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - movl -64(%rdi),%ecx - movq -80(%rdi),%r10 - cmpl %edx,%ecx - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - movl -24(%rdi),%ecx - movq -40(%rdi),%r11 - cmpl %edx,%ecx - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq %rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - movl 16(%rdi),%ecx - movq 0(%rdi),%r12 - cmpl %edx,%ecx - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - movl 56(%rdi),%ecx - movq 40(%rdi),%r13 - cmpl %edx,%ecx - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - movl 96(%rdi),%ecx - movq 80(%rdi),%r14 - cmpl %edx,%ecx - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - movl 136(%rdi),%ecx - movq 120(%rdi),%r15 - cmpl %edx,%ecx - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - testl %edx,%edx - jz L$enc8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - - vpxor (%r8),%xmm15,%xmm10 - leaq 128(%rsp),%rbp - vpxor (%r9),%xmm15,%xmm11 - vpxor (%r10),%xmm15,%xmm12 - vpxor (%r11),%xmm15,%xmm13 - vpxor %xmm10,%xmm2,%xmm2 - vpxor (%r12),%xmm15,%xmm10 - vpxor %xmm11,%xmm3,%xmm3 - vpxor (%r13),%xmm15,%xmm11 - vpxor %xmm12,%xmm4,%xmm4 - vpxor (%r14),%xmm15,%xmm12 - vpxor %xmm13,%xmm5,%xmm5 - vpxor (%r15),%xmm15,%xmm13 - vpxor %xmm10,%xmm6,%xmm6 - movl $1,%ecx - vpxor %xmm11,%xmm7,%xmm7 - vpxor %xmm12,%xmm8,%xmm8 - vpxor %xmm13,%xmm9,%xmm9 - jmp L$oop_enc8x - -.p2align 5 -L$oop_enc8x: - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r8),%xmm15,%xmm10 - movq %rbx,64+0(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,0(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 
32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r9),%xmm15,%xmm11 - movq %rbx,64+8(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,16(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r10,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r10),%xmm15,%xmm12 - movq %rbx,64+16(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,32(%rbp) - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r9) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r11),%xmm15,%xmm13 - movq %rbx,64+24(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,48(%rbp) - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r12),%xmm15,%xmm10 - movq %rbx,64+32(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r13),%xmm15,%xmm11 - movq %rbx,64+40(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesenc %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesenc %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesenc %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesenc %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesenc %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vpxor 16(%r14),%xmm15,%xmm12 - movq %rbx,64+48(%rsp) - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 24(%rsi),%xmm1 - leaq 16(%r14,%rbx,1),%r14 - vaesenc %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesenc %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesenc %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesenc %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesenc %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesenc %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesenc %xmm0,%xmm8,%xmm8 - vpxor 16(%r15),%xmm15,%xmm13 - movq %rbx,64+56(%rsp) - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - 
vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - cmpl $11,%eax - jb L$enc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je L$enc8x_tail - - vaesenc %xmm1,%xmm2,%xmm2 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesenc %xmm0,%xmm2,%xmm2 - vaesenc %xmm0,%xmm3,%xmm3 - vaesenc %xmm0,%xmm4,%xmm4 - vaesenc %xmm0,%xmm5,%xmm5 - vaesenc %xmm0,%xmm6,%xmm6 - vaesenc %xmm0,%xmm7,%xmm7 - vaesenc %xmm0,%xmm8,%xmm8 - vaesenc %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -L$enc8x_tail: - vaesenc %xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesenc %xmm1,%xmm3,%xmm3 - vaesenc %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenc %xmm1,%xmm5,%xmm5 - vaesenc %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesenc %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesenc %xmm1,%xmm8,%xmm8 - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesenclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesenclast %xmm0,%xmm3,%xmm3 - vaesenclast %xmm0,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesenclast %xmm0,%xmm5,%xmm5 - vaesenclast %xmm0,%xmm6,%xmm6 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesenclast %xmm0,%xmm7,%xmm7 - vaesenclast %xmm0,%xmm8,%xmm8 - vmovdqa %xmm14,48(%rsp) - vaesenclast %xmm0,%xmm9,%xmm9 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vpxor 0(%rbp),%xmm2,%xmm2 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vpxor 16(%rbp),%xmm3,%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vpxor 32(%rbp),%xmm4,%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vpxor 48(%rbp),%xmm5,%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vpxor %xmm10,%xmm6,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vpxor %xmm11,%xmm7,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - vpxor %xmm12,%xmm8,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vpxor %xmm13,%xmm9,%xmm9 - - decl %edx - jnz L$oop_enc8x - - movq 16(%rsp),%rax - - - - - -L$enc8x_done: - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$enc8x_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -aesni_multi_cbc_decrypt_avx: -_avx_cbc_dec_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - - - - - - - - - subq $256,%rsp - andq $-256,%rsp - subq $192,%rsp - movq %rax,16(%rsp) - -L$dec8x_body: - vzeroupper - vmovdqu (%rsi),%xmm15 - leaq 120(%rsi),%rsi - leaq 160(%rdi),%rdi - shrl $1,%edx - -L$dec8x_loop_grande: - - xorl %edx,%edx - movl -144(%rdi),%ecx - movq -160(%rdi),%r8 - cmpl %edx,%ecx - movq -152(%rdi),%rbx - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -136(%rdi),%xmm2 - movl %ecx,32(%rsp) - cmovleq %rsp,%r8 - subq %r8,%rbx - movq %rbx,64(%rsp) - vmovdqu %xmm2,192(%rsp) - 
movl -104(%rdi),%ecx - movq -120(%rdi),%r9 - cmpl %edx,%ecx - movq -112(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -96(%rdi),%xmm3 - movl %ecx,36(%rsp) - cmovleq %rsp,%r9 - subq %r9,%rbp - movq %rbp,72(%rsp) - vmovdqu %xmm3,208(%rsp) - movl -64(%rdi),%ecx - movq -80(%rdi),%r10 - cmpl %edx,%ecx - movq -72(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -56(%rdi),%xmm4 - movl %ecx,40(%rsp) - cmovleq %rsp,%r10 - subq %r10,%rbp - movq %rbp,80(%rsp) - vmovdqu %xmm4,224(%rsp) - movl -24(%rdi),%ecx - movq -40(%rdi),%r11 - cmpl %edx,%ecx - movq -32(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu -16(%rdi),%xmm5 - movl %ecx,44(%rsp) - cmovleq %rsp,%r11 - subq %r11,%rbp - movq %rbp,88(%rsp) - vmovdqu %xmm5,240(%rsp) - movl 16(%rdi),%ecx - movq 0(%rdi),%r12 - cmpl %edx,%ecx - movq 8(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 24(%rdi),%xmm6 - movl %ecx,48(%rsp) - cmovleq %rsp,%r12 - subq %r12,%rbp - movq %rbp,96(%rsp) - vmovdqu %xmm6,256(%rsp) - movl 56(%rdi),%ecx - movq 40(%rdi),%r13 - cmpl %edx,%ecx - movq 48(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 64(%rdi),%xmm7 - movl %ecx,52(%rsp) - cmovleq %rsp,%r13 - subq %r13,%rbp - movq %rbp,104(%rsp) - vmovdqu %xmm7,272(%rsp) - movl 96(%rdi),%ecx - movq 80(%rdi),%r14 - cmpl %edx,%ecx - movq 88(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 104(%rdi),%xmm8 - movl %ecx,56(%rsp) - cmovleq %rsp,%r14 - subq %r14,%rbp - movq %rbp,112(%rsp) - vmovdqu %xmm8,288(%rsp) - movl 136(%rdi),%ecx - movq 120(%rdi),%r15 - cmpl %edx,%ecx - movq 128(%rdi),%rbp - cmovgl %ecx,%edx - testl %ecx,%ecx - vmovdqu 144(%rdi),%xmm9 - movl %ecx,60(%rsp) - cmovleq %rsp,%r15 - subq %r15,%rbp - movq %rbp,120(%rsp) - vmovdqu %xmm9,304(%rsp) - testl %edx,%edx - jz L$dec8x_done - - vmovups 16-120(%rsi),%xmm1 - vmovups 32-120(%rsi),%xmm0 - movl 240-120(%rsi),%eax - leaq 192+128(%rsp),%rbp - - vmovdqu (%r8),%xmm2 - vmovdqu (%r9),%xmm3 - vmovdqu (%r10),%xmm4 - vmovdqu (%r11),%xmm5 - vmovdqu (%r12),%xmm6 - vmovdqu (%r13),%xmm7 - vmovdqu (%r14),%xmm8 - vmovdqu (%r15),%xmm9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm6,64(%rbp) - vpxor %xmm15,%xmm6,%xmm6 - vmovdqu %xmm7,80(%rbp) - vpxor %xmm15,%xmm7,%xmm7 - vmovdqu %xmm8,96(%rbp) - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu %xmm9,112(%rbp) - vpxor %xmm15,%xmm9,%xmm9 - xorq $128,%rbp - movl $1,%ecx - jmp L$oop_dec8x - -.p2align 5 -L$oop_dec8x: - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+0(%rsp),%ecx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r8) - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r8,%rbx,1),%rbx - cmovgeq %rsp,%r8 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r8,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r8),%xmm10 - movq %rbx,64+0(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -72(%rsi),%xmm1 - leaq 16(%r8,%rbx,1),%r8 - vmovdqu %xmm10,128(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+4(%rsp),%ecx - movq 64+8(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r9) - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r9,%rbx,1),%rbx - cmovgeq %rsp,%r9 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r9,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r9),%xmm11 - movq %rbx,64+8(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -56(%rsi),%xmm0 - leaq 16(%r9,%rbx,1),%r9 - vmovdqu %xmm11,144(%rsp) - vaesdec 
%xmm1,%xmm2,%xmm2 - cmpl 32+8(%rsp),%ecx - movq 64+16(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r10) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r8) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r10,%rbx,1),%rbx - cmovgeq %rsp,%r10 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r10,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r10),%xmm12 - movq %rbx,64+16(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -40(%rsi),%xmm1 - leaq 16(%r10,%rbx,1),%r10 - vmovdqu %xmm12,160(%rsp) - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+12(%rsp),%ecx - movq 64+24(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r11) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r9) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r11,%rbx,1),%rbx - cmovgeq %rsp,%r11 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r11,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r11),%xmm13 - movq %rbx,64+24(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups -24(%rsi),%xmm0 - leaq 16(%r11,%rbx,1),%r11 - vmovdqu %xmm13,176(%rsp) - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+16(%rsp),%ecx - movq 64+32(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r12) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r10) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r12,%rbx,1),%rbx - cmovgeq %rsp,%r12 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r12,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r12),%xmm10 - movq %rbx,64+32(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups -8(%rsi),%xmm1 - leaq 16(%r12,%rbx,1),%r12 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+20(%rsp),%ecx - movq 64+40(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r13) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r11) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%rbx,%r13,1),%rbx - cmovgeq %rsp,%r13 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r13,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r13),%xmm11 - movq %rbx,64+40(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 8(%rsi),%xmm0 - leaq 16(%r13,%rbx,1),%r13 - vaesdec %xmm1,%xmm2,%xmm2 - cmpl 32+24(%rsp),%ecx - movq 64+48(%rsp),%rbx - vaesdec %xmm1,%xmm3,%xmm3 - prefetcht0 31(%r14) - vaesdec %xmm1,%xmm4,%xmm4 - prefetcht0 15(%r12) - vaesdec %xmm1,%xmm5,%xmm5 - leaq (%r14,%rbx,1),%rbx - cmovgeq %rsp,%r14 - vaesdec %xmm1,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm1,%xmm7,%xmm7 - subq %r14,%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vmovdqu 16(%r14),%xmm12 - movq %rbx,64+48(%rsp) - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 24(%rsi),%xmm1 - leaq 16(%r14,%rbx,1),%r14 - vaesdec %xmm0,%xmm2,%xmm2 - cmpl 32+28(%rsp),%ecx - movq 64+56(%rsp),%rbx - vaesdec %xmm0,%xmm3,%xmm3 - prefetcht0 31(%r15) - vaesdec %xmm0,%xmm4,%xmm4 - prefetcht0 15(%r13) - vaesdec %xmm0,%xmm5,%xmm5 - leaq (%r15,%rbx,1),%rbx - cmovgeq %rsp,%r15 - vaesdec %xmm0,%xmm6,%xmm6 - cmovgq %rsp,%rbx - vaesdec %xmm0,%xmm7,%xmm7 - subq %r15,%rbx - vaesdec %xmm0,%xmm8,%xmm8 - vmovdqu 16(%r15),%xmm13 - movq %rbx,64+56(%rsp) - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 40(%rsi),%xmm0 - leaq 16(%r15,%rbx,1),%r15 - vmovdqu 32(%rsp),%xmm14 - prefetcht0 15(%r14) - prefetcht0 15(%r15) - cmpl $11,%eax - jb L$dec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 176-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - 
vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 192-120(%rsi),%xmm0 - je L$dec8x_tail - - vaesdec %xmm1,%xmm2,%xmm2 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vaesdec %xmm1,%xmm7,%xmm7 - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 208-120(%rsi),%xmm1 - - vaesdec %xmm0,%xmm2,%xmm2 - vaesdec %xmm0,%xmm3,%xmm3 - vaesdec %xmm0,%xmm4,%xmm4 - vaesdec %xmm0,%xmm5,%xmm5 - vaesdec %xmm0,%xmm6,%xmm6 - vaesdec %xmm0,%xmm7,%xmm7 - vaesdec %xmm0,%xmm8,%xmm8 - vaesdec %xmm0,%xmm9,%xmm9 - vmovups 224-120(%rsi),%xmm0 - -L$dec8x_tail: - vaesdec %xmm1,%xmm2,%xmm2 - vpxor %xmm15,%xmm15,%xmm15 - vaesdec %xmm1,%xmm3,%xmm3 - vaesdec %xmm1,%xmm4,%xmm4 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdec %xmm1,%xmm5,%xmm5 - vaesdec %xmm1,%xmm6,%xmm6 - vpaddd %xmm14,%xmm15,%xmm15 - vmovdqu 48(%rsp),%xmm14 - vaesdec %xmm1,%xmm7,%xmm7 - movq 64(%rsp),%rbx - vaesdec %xmm1,%xmm8,%xmm8 - vaesdec %xmm1,%xmm9,%xmm9 - vmovups 16-120(%rsi),%xmm1 - - vaesdeclast %xmm0,%xmm2,%xmm2 - vmovdqa %xmm15,32(%rsp) - vpxor %xmm15,%xmm15,%xmm15 - vaesdeclast %xmm0,%xmm3,%xmm3 - vpxor 0(%rbp),%xmm2,%xmm2 - vaesdeclast %xmm0,%xmm4,%xmm4 - vpxor 16(%rbp),%xmm3,%xmm3 - vpcmpgtd %xmm15,%xmm14,%xmm15 - vaesdeclast %xmm0,%xmm5,%xmm5 - vpxor 32(%rbp),%xmm4,%xmm4 - vaesdeclast %xmm0,%xmm6,%xmm6 - vpxor 48(%rbp),%xmm5,%xmm5 - vpaddd %xmm15,%xmm14,%xmm14 - vmovdqu -120(%rsi),%xmm15 - vaesdeclast %xmm0,%xmm7,%xmm7 - vpxor 64(%rbp),%xmm6,%xmm6 - vaesdeclast %xmm0,%xmm8,%xmm8 - vpxor 80(%rbp),%xmm7,%xmm7 - vmovdqa %xmm14,48(%rsp) - vaesdeclast %xmm0,%xmm9,%xmm9 - vpxor 96(%rbp),%xmm8,%xmm8 - vmovups 32-120(%rsi),%xmm0 - - vmovups %xmm2,-16(%r8) - subq %rbx,%r8 - vmovdqu 128+0(%rsp),%xmm2 - vpxor 112(%rbp),%xmm9,%xmm9 - vmovups %xmm3,-16(%r9) - subq 72(%rsp),%r9 - vmovdqu %xmm2,0(%rbp) - vpxor %xmm15,%xmm2,%xmm2 - vmovdqu 128+16(%rsp),%xmm3 - vmovups %xmm4,-16(%r10) - subq 80(%rsp),%r10 - vmovdqu %xmm3,16(%rbp) - vpxor %xmm15,%xmm3,%xmm3 - vmovdqu 128+32(%rsp),%xmm4 - vmovups %xmm5,-16(%r11) - subq 88(%rsp),%r11 - vmovdqu %xmm4,32(%rbp) - vpxor %xmm15,%xmm4,%xmm4 - vmovdqu 128+48(%rsp),%xmm5 - vmovups %xmm6,-16(%r12) - subq 96(%rsp),%r12 - vmovdqu %xmm5,48(%rbp) - vpxor %xmm15,%xmm5,%xmm5 - vmovdqu %xmm10,64(%rbp) - vpxor %xmm10,%xmm15,%xmm6 - vmovups %xmm7,-16(%r13) - subq 104(%rsp),%r13 - vmovdqu %xmm11,80(%rbp) - vpxor %xmm11,%xmm15,%xmm7 - vmovups %xmm8,-16(%r14) - subq 112(%rsp),%r14 - vmovdqu %xmm12,96(%rbp) - vpxor %xmm12,%xmm15,%xmm8 - vmovups %xmm9,-16(%r15) - subq 120(%rsp),%r15 - vmovdqu %xmm13,112(%rbp) - vpxor %xmm13,%xmm15,%xmm9 - - xorq $128,%rbp - decl %edx - jnz L$oop_dec8x - - movq 16(%rsp),%rax - - - - - -L$dec8x_done: - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$dec8x_epilogue: - .byte 0xf3,0xc3 diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s index c7606aec49a95b..970a12149bd241 100644 --- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha1-x86_64.s @@ -10,11 +10,6 @@ _aesni_cbc_sha1_enc: movq _OPENSSL_ia32cap_P+4(%rip),%r11 btq $61,%r11 jc aesni_cbc_sha1_enc_shaext - andl $268435456,%r11d - andl $1073741824,%r10d - orl %r11d,%r10d - cmpl $1342177280,%r10d - je aesni_cbc_sha1_enc_avx jmp aesni_cbc_sha1_enc_ssse3 .byte 0xf3,0xc3 @@ -1372,1304 +1367,6 @@ 
L$aesenclast5: L$epilogue_ssse3: .byte 0xf3,0xc3 - -.p2align 5 -aesni_cbc_sha1_enc_avx: - movq 8(%rsp),%r10 - - - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - leaq -104(%rsp),%rsp - - - vzeroall - movq %rdi,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - leaq 112(%rcx),%r15 - vmovdqu (%r8),%xmm12 - movq %r8,88(%rsp) - shlq $6,%r14 - subq %r12,%r13 - movl 240-112(%r15),%r8d - addq %r10,%r14 - - leaq K_XX_XX(%rip),%r11 - movl 0(%r9),%eax - movl 4(%r9),%ebx - movl 8(%r9),%ecx - movl 12(%r9),%edx - movl %ebx,%esi - movl 16(%r9),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - vmovdqa 64(%r11),%xmm6 - vmovdqa 0(%r11),%xmm10 - vmovdqu 0(%r10),%xmm0 - vmovdqu 16(%r10),%xmm1 - vmovdqu 32(%r10),%xmm2 - vmovdqu 48(%r10),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r10 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm10,%xmm0,%xmm4 - vpaddd %xmm10,%xmm1,%xmm5 - vpaddd %xmm10,%xmm2,%xmm6 - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - jmp L$oop_avx -.p2align 5 -L$oop_avx: - shrdl $2,%ebx,%ebx - vmovdqu 0(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%edi - addl 0(%rsp),%ebp - vpaddd %xmm3,%xmm10,%xmm9 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%ebp - vpxor %xmm2,%xmm8,%xmm8 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm9 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpor %xmm8,%xmm4,%xmm4 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - vpxor %xmm9,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm10,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm9 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpor %xmm8,%xmm5,%xmm5 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm5,%xmm5 - 
shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm9,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa 16(%r11),%xmm10 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpaddd %xmm5,%xmm10,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm9 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpor %xmm8,%xmm6,%xmm6 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm9,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd %xmm6,%xmm10,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm9 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpor %xmm8,%xmm7,%xmm7 - vpsrld $30,%xmm9,%xmm8 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - cmpl $11,%r8d - jb L$vaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je L$vaesenclast6 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -L$vaesenclast6: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm9,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm10,%xmm9 - addl %esi,%edx - vmovdqu 16(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,0(%r12,%r13,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - 
xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm10,%xmm9 - vmovdqa 32(%r11),%xmm10 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor 
%xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - vpaddd %xmm3,%xmm10,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm10,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - cmpl $11,%r8d - jb L$vaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je L$vaesenclast7 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -L$vaesenclast7: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - vmovdqu 32(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,16(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm10,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl 
%edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm10,%xmm9 - vmovdqa 48(%r11),%xmm10 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm10,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm10,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - cmpl $11,%r8d - jb L$vaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je L$vaesenclast8 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -L$vaesenclast8: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups 
-112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm10,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vmovdqu 48(%r12),%xmm13 - vpxor %xmm15,%xmm13,%xmm13 - vmovups %xmm12,32(%r13,%r12,1) - vpxor %xmm13,%xmm12,%xmm12 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -80(%r15),%xmm15 - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -64(%r15),%xmm14 - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -48(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm10,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups -32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm10,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups -16(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 0(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r14,%r10 - je L$done_avx - vmovdqa 64(%r11),%xmm9 - vmovdqa 0(%r11),%xmm10 - vmovdqu 0(%r10),%xmm0 - vmovdqu 16(%r10),%xmm1 - vmovdqu 32(%r10),%xmm2 - vmovdqu 48(%r10),%xmm3 - vpshufb %xmm9,%xmm0,%xmm0 - addq $64,%r10 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm9,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm10,%xmm0,%xmm8 - addl %esi,%ebx - 
xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm8,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm9,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm10,%xmm1,%xmm8 - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm8,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm9,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm10,%xmm2,%xmm8 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm8,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb L$vaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je L$vaesenclast9 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -L$vaesenclast9: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - leaq 64(%r12),%r12 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx - addl 12(%r9),%edx - movl %eax,0(%r9) - addl 16(%r9),%ebp - movl %esi,4(%r9) - movl %esi,%ebx - movl %ecx,8(%r9) - movl %ecx,%edi - movl %edx,12(%r9) - xorl %edx,%edi - movl %ebp,16(%r9) - andl %edi,%esi - jmp L$oop_avx - -L$done_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 16(%r15),%xmm15 - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 
32(%r15),%xmm14 - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 48(%r15),%xmm15 - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - cmpl $11,%r8d - jb L$vaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 64(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 80(%r15),%xmm15 - je L$vaesenclast10 - vaesenc %xmm15,%xmm12,%xmm12 - vmovups 96(%r15),%xmm14 - vaesenc %xmm14,%xmm12,%xmm12 - vmovups 112(%r15),%xmm15 -L$vaesenclast10: - vaesenclast %xmm15,%xmm12,%xmm12 - vmovups -112(%r15),%xmm15 - vmovups 16-112(%r15),%xmm14 - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vmovups %xmm12,48(%r13,%r12,1) - movq 88(%rsp),%r8 - - addl 0(%r9),%eax - addl 4(%r9),%esi - addl 8(%r9),%ecx - movl %eax,0(%r9) - addl 12(%r9),%edx - movl %esi,4(%r9) - addl 16(%r9),%ebp - movl %ecx,8(%r9) - movl %edx,12(%r9) - movl %ebp,16(%r9) - vmovups %xmm12,(%r8) - vzeroall - leaq 104(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_avx: - .byte 0xf3,0xc3 - .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 @@ -2695,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext: movups 16(%rcx),%xmm0 leaq 112(%rcx),%rcx - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 jmp L$oop_shaext .p2align 4 @@ -2759,17 +1456,17 @@ L$oop_shaext: pxor %xmm3,%xmm5 .byte 15,56,201,243 cmpl $11,%r11d - jb L$aesenclast11 + jb L$aesenclast6 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je L$aesenclast11 + je L$aesenclast6 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -L$aesenclast11: +L$aesenclast6: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -2825,17 +1522,17 @@ L$aesenclast11: pxor %xmm4,%xmm6 .byte 15,56,201,220 cmpl $11,%r11d - jb L$aesenclast12 + jb L$aesenclast7 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je L$aesenclast12 + je L$aesenclast7 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -L$aesenclast12: +L$aesenclast7: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm9 @@ -2891,17 +1588,17 @@ L$aesenclast12: pxor %xmm5,%xmm3 .byte 15,56,201,229 cmpl $11,%r11d - jb L$aesenclast13 + jb L$aesenclast8 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je L$aesenclast13 + je L$aesenclast8 movups 96(%rcx),%xmm0 .byte 
102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -L$aesenclast13: +L$aesenclast8: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 @@ -2955,17 +1652,17 @@ L$aesenclast13: movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 cmpl $11,%r11d - jb L$aesenclast14 + jb L$aesenclast9 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je L$aesenclast14 + je L$aesenclast9 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -L$aesenclast14: +L$aesenclast9: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 decq %rdx @@ -2975,8 +1672,8 @@ L$aesenclast14: leaq 64(%rdi),%rdi jnz L$oop_shaext - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 movups %xmm2,(%r8) movdqu %xmm8,(%r9) movd %xmm9,16(%r9) diff --git a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s index 7d8bdff63499df..53307da35815e1 100644 --- a/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/aes/aesni-sha256-x86_64.s @@ -5,25 +5,6 @@ .p2align 4 _aesni_cbc_sha256_enc: - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl $1,%eax - cmpq $0,%rdi - je L$probe - movl 0(%r11),%eax - movq 4(%r11),%r10 - btq $61,%r10 - jc aesni_cbc_sha256_enc_shaext - movq %r10,%r11 - shrq $32,%r11 - - testl $2048,%r10d - jnz aesni_cbc_sha256_enc_xop - andl $296,%r11d - cmpl $296,%r11d - je aesni_cbc_sha256_enc_avx2 - andl $268435456,%r10d - jnz aesni_cbc_sha256_enc_avx - ud2 xorl %eax,%eax cmpq $0,%rdi je L$probe @@ -74,4280 +55,3 @@ K256: .long 0,0,0,0, 0,0,0,0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 6 - -.p2align 6 -aesni_cbc_sha256_enc_xop: -L$xop_shortcut: - movq 8(%rsp),%r10 - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) -L$prologue_xop: - vzeroall - - movq %rdi,%r12 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp L$loop_xop -.p2align 4 -L$loop_xop: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp L$xop_00_47 - -.p2align 4 -L$xop_00_47: - subq $-32*4,%rbp - 
vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm2,%xmm3,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm0,%xmm0 - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,251,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm3,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm0,%xmm0 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,248,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm0,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor %xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - vpaddd %xmm7,%xmm0,%xmm0 - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm3,%xmm0,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm1,%xmm1 - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,248,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm0,%xmm6 - rorl $2,%r14d - addl %esi,%edx - 
vpaddd %xmm4,%xmm1,%xmm1 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,249,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm1,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm1,%xmm1 - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - rorl $14,%r13d - movl %r14d,%eax - vpalignr $4,%xmm0,%xmm1,%xmm7 - movl %r9d,%r12d - xorl %r8d,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %r10d,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %eax,%r14d - vpaddd %xmm7,%xmm2,%xmm2 - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %r10d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi -.byte 143,232,120,194,249,13 - xorl %eax,%r14d - addl %r13d,%r11d - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%esi - addl %r11d,%edx - vpsrld $10,%xmm1,%xmm6 - rorl $2,%r14d - addl %esi,%r11d - vpaddd %xmm4,%xmm2,%xmm2 - movl %edx,%r13d - addl %r11d,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%r11d - vpxor %xmm6,%xmm7,%xmm7 - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d -.byte 143,232,120,194,250,13 - xorl %r11d,%r14d - addl %r13d,%r10d - vpsrld $10,%xmm2,%xmm6 - xorl %eax,%r15d - addl %r10d,%ecx -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%r10d - vpxor 
%xmm6,%xmm7,%xmm7 - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - vpxor %xmm5,%xmm7,%xmm7 - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - vpaddd %xmm7,%xmm2,%xmm2 - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - rorl $14,%r13d - movl %r14d,%r8d - vpalignr $4,%xmm1,%xmm2,%xmm7 - movl %ebx,%r12d - xorl %eax,%r13d -.byte 143,232,120,194,236,14 - rorl $9,%r14d - xorl %ecx,%r12d - vpsrld $3,%xmm4,%xmm4 - rorl $5,%r13d - xorl %r8d,%r14d - vpaddd %xmm7,%xmm3,%xmm3 - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d -.byte 143,232,120,194,245,11 - rorl $11,%r14d - xorl %ecx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi -.byte 143,232,120,194,250,13 - xorl %r8d,%r14d - addl %r13d,%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %r9d,%esi - addl %edx,%r11d - vpsrld $10,%xmm2,%xmm6 - rorl $2,%r14d - addl %esi,%edx - vpaddd %xmm4,%xmm3,%xmm3 - movl %r11d,%r13d - addl %edx,%r14d -.byte 143,232,120,194,239,2 - rorl $14,%r13d - movl %r14d,%edx - vpxor %xmm6,%xmm7,%xmm7 - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - vpxor %xmm5,%xmm7,%xmm7 - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrldq $8,%xmm7,%xmm7 - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d -.byte 143,232,120,194,251,13 - xorl %edx,%r14d - addl %r13d,%ecx - vpsrld $10,%xmm3,%xmm6 - xorl %r8d,%r15d - addl %ecx,%r10d -.byte 143,232,120,194,239,2 - rorl $2,%r14d - addl %r15d,%ecx - vpxor %xmm6,%xmm7,%xmm7 - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - vpxor %xmm5,%xmm7,%xmm7 - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - vpslldq $8,%xmm7,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - vpaddd %xmm7,%xmm3,%xmm3 - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl 
%r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne L$xop_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d 
- andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - rorl $11,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - rorl $6,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - rorl $2,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - rorl $11,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - rorl $6,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - rorl $2,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - rorl $11,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - rorl $6,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - rorl $2,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - rorl $11,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - rorl $6,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - rorl $2,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl 
%r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - rorl $11,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - rorl $6,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - rorl $2,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - rorl $11,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - rorl $6,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - rorl $2,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - rorl $11,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - rorl $6,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - rorl $2,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - rorl $11,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - rorl $6,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - rorl $2,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jb L$loop_xop - - movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi - vmovdqu %xmm8,(%r8) - vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_xop: - .byte 0xf3,0xc3 - - -.p2align 6 -aesni_cbc_sha256_enc_avx: -L$avx_shortcut: - movq 8(%rsp),%r10 - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $128,%rsp - andq $-64,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) -L$prologue_avx: - vzeroall - - movq %rdi,%r12 - 
leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r13 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - subq $9,%r14 - - movl 0(%r15),%eax - movl 4(%r15),%ebx - movl 8(%r15),%ecx - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - - vmovdqa 0(%r13,%r14,8),%xmm14 - vmovdqa 16(%r13,%r14,8),%xmm13 - vmovdqa 32(%r13,%r14,8),%xmm12 - vmovdqu 0-128(%rdi),%xmm10 - jmp L$loop_avx -.p2align 4 -L$loop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi,%r12,1),%xmm0 - vmovdqu 16(%rsi,%r12,1),%xmm1 - vmovdqu 32(%rsi,%r12,1),%xmm2 - vmovdqu 48(%rsi,%r12,1),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%esi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%esi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp L$avx_00_47 - -.p2align 4 -L$avx_00_47: - subq $-32*4,%rbp - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm3,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpaddd %xmm6,%xmm0,%xmm0 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm0,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl 
$14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 0(%rbp),%xmm0,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm0,%xmm7 - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpaddd %xmm6,%xmm1,%xmm1 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm1,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl 
%ecx,%esi - vpaddd 32(%rbp),%xmm1,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - vpshufd $250,%xmm1,%xmm7 - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpaddd %xmm6,%xmm2,%xmm2 - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - vpshufd $80,%xmm2,%xmm7 - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - vpsrlq $17,%xmm7,%xmm7 - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpslldq $8,%xmm6,%xmm6 - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - vpaddd 64(%rbp),%xmm2,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr 
$4,%xmm1,%xmm2,%xmm7 - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - vpshufd $250,%xmm2,%xmm7 - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - vpslld $11,%xmm5,%xmm5 - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - vpxor %xmm5,%xmm4,%xmm4 - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - vpshufd $132,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpsrldq $8,%xmm6,%xmm6 - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpaddd %xmm6,%xmm3,%xmm3 - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - vpshufd $80,%xmm3,%xmm7 - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - vpsrld $10,%xmm7,%xmm6 - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - vpsrlq $17,%xmm7,%xmm7 - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpsrlq $2,%xmm7,%xmm7 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - vpshufd $232,%xmm6,%xmm6 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpslldq $8,%xmm6,%xmm6 - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - vpaddd 96(%rbp),%xmm3,%xmm6 - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - movq 64+0(%rsp),%r12 - vpand %xmm14,%xmm11,%xmm11 - movq 64+8(%rsp),%r15 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r12,1) - leaq 16(%r12),%r12 - cmpb $0,131(%rbp) - jne L$avx_00_47 - vmovdqu (%r12),%xmm9 - movq %r12,64+0(%rsp) - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - 
vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vpxor %xmm8,%xmm9,%xmm9 - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl %edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - 
addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - xorl %r8d,%r13d - shrdl $9,%r14d,%r14d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - shrdl $11,%r14d,%r14d - xorl %r10d,%r12d - xorl %ebx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r11d - andl %r15d,%esi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%esi - addl %r11d,%edx - shrdl $2,%r14d,%r14d - addl %esi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - xorl %edx,%r13d - shrdl $9,%r14d,%r14d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%esi - shrdl $11,%r14d,%r14d - xorl %r9d,%r12d - xorl %eax,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r10d - andl %esi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - addl %r10d,%ecx - shrdl $2,%r14d,%r14d - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - xorl %ecx,%r13d - shrdl $9,%r14d,%r14d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - shrdl $11,%r14d,%r14d - xorl %r8d,%r12d - xorl %r11d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%r9d - andl %r15d,%esi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%esi - addl %r9d,%ebx - shrdl $2,%r14d,%r14d - addl %esi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - xorl %ebx,%r13d - shrdl $9,%r14d,%r14d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%esi - shrdl $11,%r14d,%r14d - xorl %edx,%r12d - xorl %r10d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%r8d - andl %esi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - addl %r8d,%eax - shrdl $2,%r14d,%r14d - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - xorl %eax,%r13d - shrdl $9,%r14d,%r14d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - shrdl $11,%r14d,%r14d - xorl %ecx,%r12d - xorl %r9d,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%edx - andl %r15d,%esi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%esi - addl 
%edx,%r11d - shrdl $2,%r14d,%r14d - addl %esi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - xorl %r11d,%r13d - shrdl $9,%r14d,%r14d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%esi - shrdl $11,%r14d,%r14d - xorl %ebx,%r12d - xorl %r8d,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%ecx - andl %esi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - addl %ecx,%r10d - shrdl $2,%r14d,%r14d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - xorl %r10d,%r13d - shrdl $9,%r14d,%r14d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - shrdl $11,%r14d,%r14d - xorl %eax,%r12d - xorl %edx,%r15d - shrdl $6,%r13d,%r13d - addl %r12d,%ebx - andl %r15d,%esi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%esi - addl %ebx,%r9d - shrdl $2,%r14d,%r14d - addl %esi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - xorl %r9d,%r13d - shrdl $9,%r14d,%r14d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%esi - shrdl $11,%r14d,%r14d - xorl %r11d,%r12d - xorl %ecx,%esi - shrdl $6,%r13d,%r13d - addl %r12d,%eax - andl %esi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - addl %eax,%r8d - shrdl $2,%r14d,%r14d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%r12 - movq 64+8(%rsp),%r13 - movq 64+40(%rsp),%r15 - movq 64+48(%rsp),%rsi - - vpand %xmm14,%xmm11,%xmm11 - movl %r14d,%eax - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r12),%r12 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r12 - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - jb L$loop_avx - - movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi - vmovdqu %xmm8,(%r8) - vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_avx: - .byte 0xf3,0xc3 - - -.p2align 6 -aesni_cbc_sha256_enc_avx2: -L$avx2_shortcut: - movq 8(%rsp),%r10 - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $576,%rsp - andq $-1024,%rsp - addq $448,%rsp - - shlq $6,%rdx - subq %rdi,%rsi - subq %rdi,%r10 - addq %rdi,%rdx - - - - movq %rdx,64+16(%rsp) - - movq %r8,64+32(%rsp) - movq %r9,64+40(%rsp) - movq %r10,64+48(%rsp) - movq %r11,64+56(%rsp) -L$prologue_avx2: - vzeroall - - movq %rdi,%r13 - vpinsrq $1,%rsi,%xmm15,%xmm15 - leaq 128(%rcx),%rdi - leaq K256+544(%rip),%r12 - movl 240-128(%rdi),%r14d - movq %r9,%r15 - movq %r10,%rsi - vmovdqu (%r8),%xmm8 - leaq -9(%r14),%r14 - - vmovdqa 0(%r12,%r14,8),%xmm14 - vmovdqa 16(%r12,%r14,8),%xmm13 - vmovdqa 32(%r12,%r14,8),%xmm12 - - subq $-64,%r13 - movl 0(%r15),%eax - leaq (%rsi,%r13,1),%r12 
- movl 4(%r15),%ebx - cmpq %rdx,%r13 - movl 8(%r15),%ecx - cmoveq %rsp,%r12 - movl 12(%r15),%edx - movl 16(%r15),%r8d - movl 20(%r15),%r9d - movl 24(%r15),%r10d - movl 28(%r15),%r11d - vmovdqu 0-128(%rdi),%xmm10 - jmp L$oop_avx2 -.p2align 4 -L$oop_avx2: - vmovdqa K256+512(%rip),%ymm7 - vmovdqu -64+0(%rsi,%r13,1),%xmm0 - vmovdqu -64+16(%rsi,%r13,1),%xmm1 - vmovdqu -64+32(%rsi,%r13,1),%xmm2 - vmovdqu -64+48(%rsi,%r13,1),%xmm3 - - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm7,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm7,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - - leaq K256(%rip),%rbp - vpshufb %ymm7,%ymm2,%ymm2 - leaq -64(%r13),%r13 - vpaddd 0(%rbp),%ymm0,%ymm4 - vpshufb %ymm7,%ymm3,%ymm3 - vpaddd 32(%rbp),%ymm1,%ymm5 - vpaddd 64(%rbp),%ymm2,%ymm6 - vpaddd 96(%rbp),%ymm3,%ymm7 - vmovdqa %ymm4,0(%rsp) - xorl %r14d,%r14d - vmovdqa %ymm5,32(%rsp) - leaq -64(%rsp),%rsp - movl %ebx,%esi - vmovdqa %ymm6,0(%rsp) - xorl %ecx,%esi - vmovdqa %ymm7,32(%rsp) - movl %r9d,%r12d - subq $-32*4,%rbp - jmp L$avx2_00_47 - -.p2align 4 -L$avx2_00_47: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - leaq -64(%rsp),%rsp - vpalignr $4,%ymm0,%ymm1,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm2,%ymm3,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm0,%ymm0 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm3,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm0,%ymm0 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm0,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - 
vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - vpaddd %ymm6,%ymm0,%ymm0 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 0(%rbp),%ymm0,%ymm6 - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm1,%ymm2,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm3,%ymm0,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm1,%ymm1 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm0,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm1,%ymm1 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm1,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl 
%ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm1,%ymm1 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 32(%rbp),%ymm1,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq -64(%rsp),%rsp - vpalignr $4,%ymm2,%ymm3,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm0,%ymm1,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm2,%ymm2 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - vpshufd $250,%ymm1,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm2,%ymm2 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpshufd $80,%ymm2,%ymm7 - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpsrlq $2,%ymm7,%ymm7 - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - vpaddd %ymm6,%ymm2,%ymm2 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 
176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - vpaddd 64(%rbp),%ymm2,%ymm6 - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm3,%ymm0,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm1,%ymm2,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm3,%ymm3 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - vpshufd $250,%ymm2,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm3,%ymm3 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufd $132,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpsrldq $8,%ymm6,%ymm6 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpshufd $80,%ymm3,%ymm7 - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - vpsrld $10,%ymm7,%ymm6 - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - vpsrlq $17,%ymm7,%ymm7 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpxor %ymm7,%ymm6,%ymm6 - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpsrlq $2,%ymm7,%ymm7 - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - vpxor %ymm7,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - vpshufd $232,%ymm6,%ymm6 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - vpslldq $8,%ymm6,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - vpaddd %ymm6,%ymm3,%ymm3 - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - vpaddd 96(%rbp),%ymm3,%ymm6 - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand 
%xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 - leaq 128(%rbp),%rbp - cmpb $0,3(%rbp) - jne L$avx2_00_47 - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+64(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+64(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+64(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+64(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+64(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+64(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - 
vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+64(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+64(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - addl 0(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 
- xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vpextrq $1,%xmm15,%r12 - vmovq %xmm15,%r13 - movq 552(%rsp),%r15 - addl %r14d,%eax - leaq 448(%rsp),%rbp - - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - vmovdqu %xmm8,(%r12,%r13,1) - leaq 16(%r13),%r13 - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - addl 28(%r15),%r11d - - movl %eax,0(%r15) - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - cmpq 80(%rbp),%r13 - je L$done_avx2 - - xorl %r14d,%r14d - movl %ebx,%esi - movl %r9d,%r12d - xorl %ecx,%esi - jmp L$ower_avx2 -.p2align 4 -L$ower_avx2: - vmovdqu (%r13),%xmm9 - vpinsrq $0,%r13,%xmm15,%xmm15 - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl 
%r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vpxor %xmm10,%xmm9,%xmm9 - vmovdqu 16-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vpxor %xmm8,%xmm9,%xmm9 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 32-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 48-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 80-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl 
%edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 96-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 112-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - leaq -64(%rbp),%rbp - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 128-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ebx,%esi - xorl %r13d,%r14d - leal (%r11,%rsi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%esi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %esi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%esi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%esi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %esi,%r15d - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 144-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%esi - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r11d,%esi - xorl %r13d,%r14d - leal (%r9,%rsi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%esi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %esi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%esi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%esi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 176-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl 
%r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%esi - vpand %xmm12,%xmm11,%xmm8 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 192-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r9d,%esi - xorl %r13d,%r14d - leal (%rdx,%rsi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%esi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %esi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%esi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%esi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %esi,%r15d - vaesenclast %xmm10,%xmm9,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 208-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%esi - vpand %xmm13,%xmm11,%xmm11 - vaesenc %xmm10,%xmm9,%xmm9 - vmovdqu 224-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %edx,%esi - xorl %r13d,%r14d - leal (%rbx,%rsi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%esi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %esi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%esi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%esi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %esi,%r15d - vpor %xmm11,%xmm8,%xmm8 - vaesenclast %xmm10,%xmm9,%xmm11 - vmovdqu 0-128(%rdi),%xmm10 - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovq %xmm15,%r13 - vpextrq $1,%xmm15,%r15 - vpand %xmm14,%xmm11,%xmm11 - vpor %xmm11,%xmm8,%xmm8 - leaq -64(%rbp),%rbp - vmovdqu %xmm8,(%r15,%r13,1) - leaq 16(%r13),%r13 - cmpq %rsp,%rbp - jae L$ower_avx2 - - movq 552(%rsp),%r15 - leaq 64(%r13),%r13 - movq 560(%rsp),%rsi - addl %r14d,%eax - leaq 448(%rsp),%rsp - - addl 0(%r15),%eax - addl 4(%r15),%ebx - addl 8(%r15),%ecx - addl 12(%r15),%edx - addl 16(%r15),%r8d - addl 20(%r15),%r9d - addl 24(%r15),%r10d - leaq (%rsi,%r13,1),%r12 - addl 28(%r15),%r11d - - cmpq 64+16(%rsp),%r13 - - movl %eax,0(%r15) - cmoveq %rsp,%r12 - movl %ebx,4(%r15) - movl %ecx,8(%r15) - movl %edx,12(%r15) - movl %r8d,16(%r15) - movl %r9d,20(%r15) - movl %r10d,24(%r15) - movl %r11d,28(%r15) - - jbe L$oop_avx2 - leaq (%rsp),%rbp - -L$done_avx2: - leaq (%rbp),%rsp - movq 64+32(%rsp),%r8 - movq 64+56(%rsp),%rsi - vmovdqu %xmm8,(%r8) - vzeroall - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_avx2: - .byte 0xf3,0xc3 - - -.p2align 5 -aesni_cbc_sha256_enc_shaext: - movq 8(%rsp),%r10 - leaq K256+128(%rip),%rax - movdqu (%r9),%xmm1 - movdqu 16(%r9),%xmm2 - movdqa 512-128(%rax),%xmm3 - - movl 240(%rcx),%r11d - subq %rdi,%rsi - movups (%rcx),%xmm15 - movups 16(%rcx),%xmm4 - leaq 112(%rcx),%rcx - - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 - 
movdqa %xmm3,%xmm7 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - - jmp L$oop_shaext - -.p2align 4 -L$oop_shaext: - movdqu (%r10),%xmm10 - movdqu 16(%r10),%xmm11 - movdqu 32(%r10),%xmm12 -.byte 102,68,15,56,0,211 - movdqu 48(%r10),%xmm13 - - movdqa 0-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 102,68,15,56,0,219 - movdqa %xmm2,%xmm9 - movdqa %xmm1,%xmm8 - movups 0(%rdi),%xmm14 - xorps %xmm15,%xmm14 - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 32-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 102,68,15,56,0,227 - leaq 64(%r10),%r10 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 64-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 102,68,15,56,0,235 -.byte 69,15,56,204,211 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 96-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 -.byte 15,56,203,202 - movdqa 128-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - cmpl $11,%r11d - jb L$aesenclast1 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je L$aesenclast1 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -L$aesenclast1: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 16(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,0(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 160-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 192-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 224-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 256-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl $11,%r11d - jb 
L$aesenclast2 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je L$aesenclast2 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -L$aesenclast2: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,202 - movups 32(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,16(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movdqa 288-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 320-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 -.byte 69,15,56,204,211 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm13,%xmm3 -.byte 102,65,15,58,15,220,4 - paddd %xmm3,%xmm10 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 352-128(%rax),%xmm0 - paddd %xmm13,%xmm0 -.byte 69,15,56,205,213 -.byte 69,15,56,204,220 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,65,15,58,15,221,4 - paddd %xmm3,%xmm11 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 384-128(%rax),%xmm0 - paddd %xmm10,%xmm0 -.byte 69,15,56,205,218 -.byte 69,15,56,204,229 - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm11,%xmm3 -.byte 102,65,15,58,15,218,4 - paddd %xmm3,%xmm12 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - movdqa 416-128(%rax),%xmm0 - paddd %xmm11,%xmm0 -.byte 69,15,56,205,227 -.byte 69,15,56,204,234 - cmpl $11,%r11d - jb L$aesenclast3 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je L$aesenclast3 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -L$aesenclast3: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm12,%xmm3 -.byte 102,65,15,58,15,219,4 - paddd %xmm3,%xmm13 - movups 48(%rdi),%xmm14 - xorps %xmm15,%xmm14 - movups %xmm6,32(%rsi,%rdi,1) - xorps %xmm14,%xmm6 - movups -80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups -64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 448-128(%rax),%xmm0 - paddd %xmm12,%xmm0 -.byte 69,15,56,205,236 - movdqa %xmm7,%xmm3 - movups -48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups -32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,202 - - movdqa 480-128(%rax),%xmm0 - paddd %xmm13,%xmm0 - movups -16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - movups 0(%rcx),%xmm4 - aesenc %xmm5,%xmm6 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movups 16(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -.byte 15,56,203,202 - - movups 32(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 48(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - cmpl $11,%r11d - jb L$aesenclast4 - movups 64(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 80(%rcx),%xmm5 - aesenc %xmm4,%xmm6 - je L$aesenclast4 - movups 96(%rcx),%xmm4 - aesenc %xmm5,%xmm6 - movups 112(%rcx),%xmm5 - aesenc %xmm4,%xmm6 -L$aesenclast4: - aesenclast %xmm5,%xmm6 - movups 16-112(%rcx),%xmm4 - nop - - paddd %xmm9,%xmm2 - paddd %xmm8,%xmm1 - - decq %rdx - movups %xmm6,48(%rsi,%rdi,1) - leaq 64(%rdi),%rdi - jnz L$oop_shaext - - 
pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm3 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,211,8 - - movups %xmm6,(%r8) - movdqu %xmm1,(%r9) - movdqu %xmm2,16(%r9) - .byte 0xf3,0xc3 diff --git a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s index 1819757f0b4f4f..02f7f562ea827d 100644 --- a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s +++ b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-avx2.s @@ -1,1632 +1,24 @@ .text -.globl _rsaz_1024_sqr_avx2 - -.p2align 6 -_rsaz_1024_sqr_avx2: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - vzeroupper - movq %rax,%rbp - movq %rdx,%r13 - subq $832,%rsp - movq %r13,%r15 - subq $-128,%rdi - subq $-128,%rsi - subq $-128,%r13 - - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - vpxor %ymm9,%ymm9,%ymm9 - jz L$sqr_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%r13),%ymm0 - andq $-2048,%rsp - vmovdqu 32-128(%r13),%ymm1 - vmovdqu 64-128(%r13),%ymm2 - vmovdqu 96-128(%r13),%ymm3 - vmovdqu 128-128(%r13),%ymm4 - vmovdqu 160-128(%r13),%ymm5 - vmovdqu 192-128(%r13),%ymm6 - vmovdqu 224-128(%r13),%ymm7 - vmovdqu 256-128(%r13),%ymm8 - leaq 832+128(%rsp),%r13 - vmovdqu %ymm0,0-128(%r13) - vmovdqu %ymm1,32-128(%r13) - vmovdqu %ymm2,64-128(%r13) - vmovdqu %ymm3,96-128(%r13) - vmovdqu %ymm4,128-128(%r13) - vmovdqu %ymm5,160-128(%r13) - vmovdqu %ymm6,192-128(%r13) - vmovdqu %ymm7,224-128(%r13) - vmovdqu %ymm8,256-128(%r13) - vmovdqu %ymm9,288-128(%r13) - -L$sqr_1024_no_n_copy: - andq $-1024,%rsp - - vmovdqu 32-128(%rsi),%ymm1 - vmovdqu 64-128(%rsi),%ymm2 - vmovdqu 96-128(%rsi),%ymm3 - vmovdqu 128-128(%rsi),%ymm4 - vmovdqu 160-128(%rsi),%ymm5 - vmovdqu 192-128(%rsi),%ymm6 - vmovdqu 224-128(%rsi),%ymm7 - vmovdqu 256-128(%rsi),%ymm8 - - leaq 192(%rsp),%rbx - vpbroadcastq L$and_mask(%rip),%ymm15 - jmp L$OOP_GRANDE_SQR_1024 - -.p2align 5 -L$OOP_GRANDE_SQR_1024: - leaq 576+128(%rsp),%r9 - leaq 448(%rsp),%r12 - - - - - vpaddq %ymm1,%ymm1,%ymm1 - vpbroadcastq 0-128(%rsi),%ymm10 - vpaddq %ymm2,%ymm2,%ymm2 - vmovdqa %ymm1,0-128(%r9) - vpaddq %ymm3,%ymm3,%ymm3 - vmovdqa %ymm2,32-128(%r9) - vpaddq %ymm4,%ymm4,%ymm4 - vmovdqa %ymm3,64-128(%r9) - vpaddq %ymm5,%ymm5,%ymm5 - vmovdqa %ymm4,96-128(%r9) - vpaddq %ymm6,%ymm6,%ymm6 - vmovdqa %ymm5,128-128(%r9) - vpaddq %ymm7,%ymm7,%ymm7 - vmovdqa %ymm6,160-128(%r9) - vpaddq %ymm8,%ymm8,%ymm8 - vmovdqa %ymm7,192-128(%r9) - vpxor %ymm9,%ymm9,%ymm9 - vmovdqa %ymm8,224-128(%r9) - - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpbroadcastq 32-128(%rsi),%ymm11 - vmovdqu %ymm9,288-192(%rbx) - vpmuludq %ymm10,%ymm1,%ymm1 - vmovdqu %ymm9,320-448(%r12) - vpmuludq %ymm10,%ymm2,%ymm2 - vmovdqu %ymm9,352-448(%r12) - vpmuludq %ymm10,%ymm3,%ymm3 - vmovdqu %ymm9,384-448(%r12) - vpmuludq %ymm10,%ymm4,%ymm4 - vmovdqu %ymm9,416-448(%r12) - vpmuludq %ymm10,%ymm5,%ymm5 - vmovdqu %ymm9,448-448(%r12) - vpmuludq %ymm10,%ymm6,%ymm6 - vmovdqu %ymm9,480-448(%r12) - vpmuludq %ymm10,%ymm7,%ymm7 - vmovdqu %ymm9,512-448(%r12) - vpmuludq %ymm10,%ymm8,%ymm8 - vpbroadcastq 64-128(%rsi),%ymm10 - vmovdqu %ymm9,544-448(%r12) - - movq %rsi,%r15 - movl $4,%r14d - jmp L$sqr_entry_1024 -.p2align 5 -L$OOP_SQR_1024: - vpbroadcastq 32-128(%r15),%ymm11 - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpaddq 0-192(%rbx),%ymm0,%ymm0 - vpmuludq 0-128(%r9),%ymm10,%ymm1 - vpaddq 32-192(%rbx),%ymm1,%ymm1 - vpmuludq 32-128(%r9),%ymm10,%ymm2 - vpaddq 64-192(%rbx),%ymm2,%ymm2 - vpmuludq 64-128(%r9),%ymm10,%ymm3 - vpaddq 96-192(%rbx),%ymm3,%ymm3 - vpmuludq 96-128(%r9),%ymm10,%ymm4 - 
vpaddq 128-192(%rbx),%ymm4,%ymm4 - vpmuludq 128-128(%r9),%ymm10,%ymm5 - vpaddq 160-192(%rbx),%ymm5,%ymm5 - vpmuludq 160-128(%r9),%ymm10,%ymm6 - vpaddq 192-192(%rbx),%ymm6,%ymm6 - vpmuludq 192-128(%r9),%ymm10,%ymm7 - vpaddq 224-192(%rbx),%ymm7,%ymm7 - vpmuludq 224-128(%r9),%ymm10,%ymm8 - vpbroadcastq 64-128(%r15),%ymm10 - vpaddq 256-192(%rbx),%ymm8,%ymm8 -L$sqr_entry_1024: - vmovdqu %ymm0,0-192(%rbx) - vmovdqu %ymm1,32-192(%rbx) - - vpmuludq 32-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 32-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 64-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 96-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 128-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 160-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 192-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 224-128(%r9),%ymm11,%ymm0 - vpbroadcastq 96-128(%r15),%ymm11 - vpaddq 288-192(%rbx),%ymm0,%ymm0 - - vmovdqu %ymm2,64-192(%rbx) - vmovdqu %ymm3,96-192(%rbx) - - vpmuludq 64-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 64-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 96-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 128-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 160-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 224-128(%r9),%ymm10,%ymm1 - vpbroadcastq 128-128(%r15),%ymm10 - vpaddq 320-448(%r12),%ymm1,%ymm1 - - vmovdqu %ymm4,128-192(%rbx) - vmovdqu %ymm5,160-192(%rbx) - - vpmuludq 96-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 96-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq 128-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm0,%ymm0 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq 224-128(%r9),%ymm11,%ymm2 - vpbroadcastq 160-128(%r15),%ymm11 - vpaddq 352-448(%r12),%ymm2,%ymm2 - - vmovdqu %ymm6,192-192(%rbx) - vmovdqu %ymm7,224-192(%rbx) - - vpmuludq 128-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 128-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 160-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 192-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 224-128(%r9),%ymm10,%ymm3 - vpbroadcastq 192-128(%r15),%ymm10 - vpaddq 384-448(%r12),%ymm3,%ymm3 - - vmovdqu %ymm8,256-192(%rbx) - vmovdqu %ymm0,288-192(%rbx) - leaq 8(%rbx),%rbx - - vpmuludq 160-128(%rsi),%ymm11,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 224-128(%r9),%ymm11,%ymm4 - vpbroadcastq 224-128(%r15),%ymm11 - vpaddq 416-448(%r12),%ymm4,%ymm4 - - vmovdqu %ymm1,320-448(%r12) - vmovdqu %ymm2,352-448(%r12) - - vpmuludq 192-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpbroadcastq 256-128(%r15),%ymm0 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq 224-128(%r9),%ymm10,%ymm5 - vpbroadcastq 0+8-128(%r15),%ymm10 - vpaddq 448-448(%r12),%ymm5,%ymm5 - - vmovdqu %ymm3,384-448(%r12) - vmovdqu %ymm4,416-448(%r12) - leaq 8(%r15),%r15 - - vpmuludq 224-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 224-128(%r9),%ymm11,%ymm6 - vpaddq 480-448(%r12),%ymm6,%ymm6 - - vpmuludq 256-128(%rsi),%ymm0,%ymm7 - vmovdqu %ymm5,448-448(%r12) - vpaddq 
512-448(%r12),%ymm7,%ymm7 - vmovdqu %ymm6,480-448(%r12) - vmovdqu %ymm7,512-448(%r12) - leaq 8(%r12),%r12 - - decl %r14d - jnz L$OOP_SQR_1024 - - vmovdqu 256(%rsp),%ymm8 - vmovdqu 288(%rsp),%ymm1 - vmovdqu 320(%rsp),%ymm2 - leaq 192(%rsp),%rbx - - vpsrlq $29,%ymm8,%ymm14 - vpand %ymm15,%ymm8,%ymm8 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - - vpermq $147,%ymm14,%ymm14 - vpxor %ymm9,%ymm9,%ymm9 - vpermq $147,%ymm11,%ymm11 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm8,%ymm8 - vpblendd $3,%ymm11,%ymm9,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,288-192(%rbx) - vmovdqu %ymm2,320-192(%rbx) - - movq (%rsp),%rax - movq 8(%rsp),%r10 - movq 16(%rsp),%r11 - movq 24(%rsp),%r12 - vmovdqu 32(%rsp),%ymm1 - vmovdqu 64-192(%rbx),%ymm2 - vmovdqu 96-192(%rbx),%ymm3 - vmovdqu 128-192(%rbx),%ymm4 - vmovdqu 160-192(%rbx),%ymm5 - vmovdqu 192-192(%rbx),%ymm6 - vmovdqu 224-192(%rbx),%ymm7 - - movq %rax,%r9 - imull %ecx,%eax - andl $536870911,%eax - vmovd %eax,%xmm12 - - movq %rax,%rdx - imulq -128(%r13),%rax - vpbroadcastq %xmm12,%ymm12 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax - shrq $29,%r9 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - addq %r9,%r10 - addq %rax,%r11 - imulq 24-128(%r13),%rdx - addq %rdx,%r12 - - movq %r10,%rax - imull %ecx,%eax - andl $536870911,%eax - - movl $9,%r14d - jmp L$OOP_REDUCE_1024 - -.p2align 5 -L$OOP_REDUCE_1024: - vmovd %eax,%xmm13 - vpbroadcastq %xmm13,%ymm13 - - vpmuludq 32-128(%r13),%ymm12,%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm10,%ymm1,%ymm1 - addq %rax,%r10 - vpmuludq 64-128(%r13),%ymm12,%ymm14 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm14,%ymm2,%ymm2 - vpmuludq 96-128(%r13),%ymm12,%ymm11 -.byte 0x67 - addq %rax,%r11 -.byte 0x67 - movq %rdx,%rax - imulq 16-128(%r13),%rax - shrq $29,%r10 - vpaddq %ymm11,%ymm3,%ymm3 - vpmuludq 128-128(%r13),%ymm12,%ymm10 - addq %rax,%r12 - addq %r10,%r11 - vpaddq %ymm10,%ymm4,%ymm4 - vpmuludq 160-128(%r13),%ymm12,%ymm14 - movq %r11,%rax - imull %ecx,%eax - vpaddq %ymm14,%ymm5,%ymm5 - vpmuludq 192-128(%r13),%ymm12,%ymm11 - andl $536870911,%eax - vpaddq %ymm11,%ymm6,%ymm6 - vpmuludq 224-128(%r13),%ymm12,%ymm10 - vpaddq %ymm10,%ymm7,%ymm7 - vpmuludq 256-128(%r13),%ymm12,%ymm14 - vmovd %eax,%xmm12 - - vpaddq %ymm14,%ymm8,%ymm8 - - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 32-8-128(%r13),%ymm13,%ymm11 - vmovdqu 96-8-128(%r13),%ymm14 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm1,%ymm1 - vpmuludq 64-8-128(%r13),%ymm13,%ymm10 - vmovdqu 128-8-128(%r13),%ymm11 - addq %rax,%r11 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm10,%ymm2,%ymm2 - addq %r12,%rax - shrq $29,%r11 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 160-8-128(%r13),%ymm10 - addq %r11,%rax - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 192-8-128(%r13),%ymm14 -.byte 0x67 - movq %rax,%r12 - imull %ecx,%eax - vpaddq %ymm11,%ymm4,%ymm4 - vpmuludq %ymm13,%ymm10,%ymm10 -.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 - andl $536870911,%eax - vpaddq %ymm10,%ymm5,%ymm5 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 256-8-128(%r13),%ymm10 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 288-8-128(%r13),%ymm9 - vmovd %eax,%xmm0 - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm7,%ymm7 - vpmuludq %ymm13,%ymm10,%ymm10 - vmovdqu 32-16-128(%r13),%ymm14 - vpbroadcastq %xmm0,%ymm0 - vpaddq %ymm10,%ymm8,%ymm8 - vpmuludq %ymm13,%ymm9,%ymm9 - vmovdqu 64-16-128(%r13),%ymm11 - addq %rax,%r12 - - 
vmovdqu 32-24-128(%r13),%ymm13 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 96-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq %ymm0,%ymm13,%ymm13 - vpmuludq %ymm12,%ymm11,%ymm11 -.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff - vpaddq %ymm1,%ymm13,%ymm13 - vpaddq %ymm11,%ymm2,%ymm2 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 160-16-128(%r13),%ymm11 -.byte 0x67 - vmovq %xmm13,%rax - vmovdqu %ymm13,(%rsp) - vpaddq %ymm10,%ymm3,%ymm3 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 192-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq %ymm12,%ymm11,%ymm11 - vmovdqu 224-16-128(%r13),%ymm14 - vpaddq %ymm11,%ymm5,%ymm5 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 256-16-128(%r13),%ymm11 - vpaddq %ymm10,%ymm6,%ymm6 - vpmuludq %ymm12,%ymm14,%ymm14 - shrq $29,%r12 - vmovdqu 288-16-128(%r13),%ymm10 - addq %r12,%rax - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq %ymm12,%ymm11,%ymm11 - - movq %rax,%r9 - imull %ecx,%eax - vpaddq %ymm11,%ymm8,%ymm8 - vpmuludq %ymm12,%ymm10,%ymm10 - andl $536870911,%eax - vmovd %eax,%xmm12 - vmovdqu 96-24-128(%r13),%ymm11 -.byte 0x67 - vpaddq %ymm10,%ymm9,%ymm9 - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 64-24-128(%r13),%ymm0,%ymm14 - vmovdqu 128-24-128(%r13),%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - movq 8(%rsp),%r10 - vpaddq %ymm14,%ymm2,%ymm1 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 160-24-128(%r13),%ymm14 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax -.byte 0x67 - shrq $29,%r9 - movq 16(%rsp),%r11 - vpaddq %ymm11,%ymm3,%ymm2 - vpmuludq %ymm0,%ymm10,%ymm10 - vmovdqu 192-24-128(%r13),%ymm11 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - vpaddq %ymm10,%ymm4,%ymm3 - vpmuludq %ymm0,%ymm14,%ymm14 - vmovdqu 224-24-128(%r13),%ymm10 - imulq 24-128(%r13),%rdx - addq %rax,%r11 - leaq (%r9,%r10,1),%rax - vpaddq %ymm14,%ymm5,%ymm4 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 256-24-128(%r13),%ymm14 - movq %rax,%r10 - imull %ecx,%eax - vpmuludq %ymm0,%ymm10,%ymm10 - vpaddq %ymm11,%ymm6,%ymm5 - vmovdqu 288-24-128(%r13),%ymm11 - andl $536870911,%eax - vpaddq %ymm10,%ymm7,%ymm6 - vpmuludq %ymm0,%ymm14,%ymm14 - addq 24(%rsp),%rdx - vpaddq %ymm14,%ymm8,%ymm7 - vpmuludq %ymm0,%ymm11,%ymm11 - vpaddq %ymm11,%ymm9,%ymm8 - vmovq %r12,%xmm9 - movq %rdx,%r12 - - decl %r14d - jnz L$OOP_REDUCE_1024 - leaq 448(%rsp),%r12 - vpaddq %ymm9,%ymm13,%ymm0 - vpxor %ymm9,%ymm9,%ymm9 - - vpaddq 288-192(%rbx),%ymm0,%ymm0 - vpaddq 320-448(%r12),%ymm1,%ymm1 - vpaddq 352-448(%r12),%ymm2,%ymm2 - vpaddq 384-448(%r12),%ymm3,%ymm3 - vpaddq 416-448(%r12),%ymm4,%ymm4 - vpaddq 448-448(%r12),%ymm5,%ymm5 - vpaddq 480-448(%r12),%ymm6,%ymm6 - vpaddq 512-448(%r12),%ymm7,%ymm7 - vpaddq 544-448(%r12),%ymm8,%ymm8 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $147,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm13,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand 
%ymm15,%ymm3,%ymm3 - vpermq $147,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vmovdqu %ymm0,0-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,32-128(%rdi) - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vmovdqu %ymm2,64-128(%rdi) - vpaddq %ymm13,%ymm4,%ymm4 - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vpaddq %ymm13,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $147,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $147,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vmovdqu %ymm4,128-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vmovdqu %ymm5,160-128(%rdi) - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vmovdqu %ymm6,192-128(%rdi) - vpaddq %ymm13,%ymm8,%ymm8 - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - - movq %rdi,%rsi - decl %r8d - jne L$OOP_GRANDE_SQR_1024 - - vzeroall - movq %rbp,%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$sqr_1024_epilogue: - .byte 0xf3,0xc3 - -.globl _rsaz_1024_mul_avx2 - -.p2align 6 -_rsaz_1024_mul_avx2: - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rax,%rbp - vzeroall - movq %rdx,%r13 - subq $64,%rsp - - - - - - -.byte 0x67,0x67 - movq %rsi,%r15 - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - movq %rsi,%r15 - cmovnzq %r13,%rsi - cmovnzq %r15,%r13 - - movq %rcx,%r15 - subq $-128,%rsi - subq $-128,%rcx - subq $-128,%rdi - - andq $4095,%r15 - addq $320,%r15 -.byte 0x67,0x67 - shrq $12,%r15 - jz L$mul_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%rcx),%ymm0 - andq $-512,%rsp - vmovdqu 32-128(%rcx),%ymm1 - vmovdqu 64-128(%rcx),%ymm2 - vmovdqu 96-128(%rcx),%ymm3 - vmovdqu 128-128(%rcx),%ymm4 - vmovdqu 160-128(%rcx),%ymm5 - vmovdqu 192-128(%rcx),%ymm6 - vmovdqu 224-128(%rcx),%ymm7 - vmovdqu 256-128(%rcx),%ymm8 - leaq 64+128(%rsp),%rcx - vmovdqu %ymm0,0-128(%rcx) - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm1,32-128(%rcx) - vpxor %ymm1,%ymm1,%ymm1 - vmovdqu %ymm2,64-128(%rcx) - vpxor %ymm2,%ymm2,%ymm2 - vmovdqu %ymm3,96-128(%rcx) - vpxor %ymm3,%ymm3,%ymm3 - vmovdqu %ymm4,128-128(%rcx) - vpxor %ymm4,%ymm4,%ymm4 - vmovdqu 
%ymm5,160-128(%rcx) - vpxor %ymm5,%ymm5,%ymm5 - vmovdqu %ymm6,192-128(%rcx) - vpxor %ymm6,%ymm6,%ymm6 - vmovdqu %ymm7,224-128(%rcx) - vpxor %ymm7,%ymm7,%ymm7 - vmovdqu %ymm8,256-128(%rcx) - vmovdqa %ymm0,%ymm8 - vmovdqu %ymm9,288-128(%rcx) -L$mul_1024_no_n_copy: - andq $-64,%rsp - - movq (%r13),%rbx - vpbroadcastq (%r13),%ymm10 - vmovdqu %ymm0,(%rsp) - xorq %r9,%r9 -.byte 0x67 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - - vmovdqu L$and_mask(%rip),%ymm15 - movl $9,%r14d - vmovdqu %ymm9,288-128(%rdi) - jmp L$oop_mul_1024 - -.p2align 5 -L$oop_mul_1024: - vpsrlq $29,%ymm3,%ymm9 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r9,%rax - movq %rbx,%r10 - imulq 8-128(%rsi),%r10 - addq 8(%rsp),%r10 - - movq %rax,%r9 - imull %r8d,%eax - andl $536870911,%eax - - movq %rbx,%r11 - imulq 16-128(%rsi),%r11 - addq 16(%rsp),%r11 - - movq %rbx,%r12 - imulq 24-128(%rsi),%r12 - addq 24(%rsp),%r12 - vpmuludq 32-128(%rsi),%ymm10,%ymm0 - vmovd %eax,%xmm11 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq 64-128(%rsi),%ymm10,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 96-128(%rsi),%ymm10,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq 128-128(%rsi),%ymm10,%ymm0 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq 160-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 192-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq 224-128(%rsi),%ymm10,%ymm0 - vpermq $147,%ymm9,%ymm9 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq 256-128(%rsi),%ymm10,%ymm12 - vpbroadcastq 8(%r13),%ymm10 - vpaddq %ymm12,%ymm8,%ymm8 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%rcx),%rax - addq %rax,%r11 - shrq $29,%r9 - imulq 24-128(%rcx),%rdx - addq %rdx,%r12 - addq %r9,%r10 - - vpmuludq 32-128(%rcx),%ymm11,%ymm13 - vmovq %xmm10,%rbx - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 64-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm2,%ymm2 - vpmuludq 96-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 128-128(%rcx),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 160-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm5,%ymm5 - vpmuludq 192-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm9 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm9,%ymm3,%ymm3 - vpaddq %ymm0,%ymm8,%ymm8 - - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rsi),%ymm12 - movq %rbx,%rax - imulq 8-128(%rsi),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rsi),%ymm13 - - movq %r10,%rax - imull %r8d,%eax - andl $536870911,%eax - - imulq 16-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovd %eax,%xmm11 - vmovdqu -8+96-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -8+128-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+160-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+192-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -8+224-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+256-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+288-128(%rsi),%ymm9 - vpaddq %ymm12,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm13,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm9,%ymm9 - vpbroadcastq 16(%r13),%ymm10 - - movq %rax,%rdx - imulq 
-128(%rcx),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rcx),%ymm0 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rcx),%ymm12 - shrq $29,%r10 - imulq 16-128(%rcx),%rdx - addq %rdx,%r12 - addq %r10,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -8+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rsi),%ymm0 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r11,%rax - - vmovdqu -16+64-128(%rsi),%ymm12 - movq %rax,%r11 - imull %r8d,%eax - andl $536870911,%eax - - imulq 8-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -16+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -16+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -16+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 24(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rcx),%ymm0 - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r11 - vmovdqu -16+64-128(%rcx),%ymm12 - imulq 8-128(%rcx),%rdx - addq %rdx,%r12 - shrq $29,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -16+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+32-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+64-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm9,%ymm9 - - addq %r11,%r12 - imulq -128(%rsi),%rbx - addq %rbx,%r12 - - movq %r12,%rax - imull %r8d,%eax - andl $536870911,%eax - - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -24+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -24+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu 
-24+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -24+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 32(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - addq $32,%r13 - - vmovdqu -24+32-128(%rcx),%ymm0 - imulq -128(%rcx),%rax - addq %rax,%r12 - shrq $29,%r12 - - vmovdqu -24+64-128(%rcx),%ymm12 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -24+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm0 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu %ymm0,(%rsp) - vpaddq %ymm12,%ymm2,%ymm1 - vmovdqu -24+128-128(%rcx),%ymm0 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm2 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm3 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm4 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm5 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+288-128(%rcx),%ymm13 - movq %r12,%r9 - vpaddq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm11,%ymm12,%ymm12 - addq (%rsp),%r9 - vpaddq %ymm12,%ymm8,%ymm7 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovq %r12,%xmm12 - vpaddq %ymm13,%ymm9,%ymm8 - - decl %r14d - jnz L$oop_mul_1024 - vpermq $0,%ymm15,%ymm15 - vpaddq (%rsp),%ymm12,%ymm0 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm10,%ymm10 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpermq $147,%ymm11,%ymm11 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpermq $147,%ymm10,%ymm10 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm11,%ymm11 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vmovdqu %ymm0,0-128(%rdi) - vmovdqu %ymm1,32-128(%rdi) - vmovdqu %ymm2,64-128(%rdi) - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 
- vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $147,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $147,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $147,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $147,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $147,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vmovdqu %ymm4,128-128(%rdi) - vmovdqu %ymm5,160-128(%rdi) - vmovdqu %ymm6,192-128(%rdi) - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - vzeroupper - - movq %rbp,%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$mul_1024_epilogue: - .byte 0xf3,0xc3 - -.globl _rsaz_1024_red2norm_avx2 +.globl _rsaz_avx2_eligible -.p2align 5 -_rsaz_1024_red2norm_avx2: - subq $-128,%rsi - xorq %rax,%rax - movq -128(%rsi),%r8 - movq -120(%rsi),%r9 - movq -112(%rsi),%r10 - shlq $0,%r8 - shlq $29,%r9 - movq %r10,%r11 - shlq $58,%r10 - shrq $6,%r11 - addq %r8,%rax - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,0(%rdi) - movq %r11,%rax - movq -104(%rsi),%r8 - movq -96(%rsi),%r9 - shlq $23,%r8 - movq %r9,%r10 - shlq $52,%r9 - shrq $12,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,8(%rdi) - movq %r10,%rax - movq -88(%rsi),%r11 - movq -80(%rsi),%r8 - shlq $17,%r11 - movq %r8,%r9 - shlq $46,%r8 - shrq $18,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,16(%rdi) - movq %r9,%rax - movq -72(%rsi),%r10 - movq -64(%rsi),%r11 - shlq $11,%r10 - movq %r11,%r8 - shlq $40,%r11 - shrq $24,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,24(%rdi) - movq %r8,%rax - movq -56(%rsi),%r9 - movq -48(%rsi),%r10 - movq -40(%rsi),%r11 - shlq $5,%r9 - shlq $34,%r10 - movq %r11,%r8 - shlq $63,%r11 - shrq $1,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,32(%rdi) - movq %r8,%rax - movq -32(%rsi),%r9 - movq -24(%rsi),%r10 - shlq $28,%r9 - movq %r10,%r11 - shlq $57,%r10 - shrq $7,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,40(%rdi) - movq %r11,%rax - movq -16(%rsi),%r8 - movq -8(%rsi),%r9 - shlq $22,%r8 - movq %r9,%r10 - shlq $51,%r9 - shrq $13,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,48(%rdi) - movq %r10,%rax - movq 0(%rsi),%r11 - movq 8(%rsi),%r8 - shlq $16,%r11 - movq %r8,%r9 - shlq $45,%r8 - shrq $19,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,56(%rdi) - movq %r9,%rax - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - shlq $10,%r10 - movq %r11,%r8 - shlq $39,%r11 - shrq $25,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,64(%rdi) - movq %r8,%rax - movq 32(%rsi),%r9 - movq 40(%rsi),%r10 - movq 48(%rsi),%r11 - shlq $4,%r9 - shlq $33,%r10 - movq %r11,%r8 - shlq $62,%r11 - shrq $2,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,72(%rdi) - movq %r8,%rax - movq 56(%rsi),%r9 - movq 
64(%rsi),%r10 - shlq $27,%r9 - movq %r10,%r11 - shlq $56,%r10 - shrq $8,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,80(%rdi) - movq %r11,%rax - movq 72(%rsi),%r8 - movq 80(%rsi),%r9 - shlq $21,%r8 - movq %r9,%r10 - shlq $50,%r9 - shrq $14,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,88(%rdi) - movq %r10,%rax - movq 88(%rsi),%r11 - movq 96(%rsi),%r8 - shlq $15,%r11 - movq %r8,%r9 - shlq $44,%r8 - shrq $20,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,96(%rdi) - movq %r9,%rax - movq 104(%rsi),%r10 - movq 112(%rsi),%r11 - shlq $9,%r10 - movq %r11,%r8 - shlq $38,%r11 - shrq $26,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,104(%rdi) - movq %r8,%rax - movq 120(%rsi),%r9 - movq 128(%rsi),%r10 - movq 136(%rsi),%r11 - shlq $3,%r9 - shlq $32,%r10 - movq %r11,%r8 - shlq $61,%r11 - shrq $3,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,112(%rdi) - movq %r8,%rax - movq 144(%rsi),%r9 - movq 152(%rsi),%r10 - shlq $26,%r9 - movq %r10,%r11 - shlq $55,%r10 - shrq $9,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,120(%rdi) - movq %r11,%rax +_rsaz_avx2_eligible: + xorl %eax,%eax .byte 0xf3,0xc3 +.globl _rsaz_1024_sqr_avx2 +.globl _rsaz_1024_mul_avx2 .globl _rsaz_1024_norm2red_avx2 - -.p2align 5 -_rsaz_1024_norm2red_avx2: - subq $-128,%rdi - movq (%rsi),%r8 - movl $536870911,%eax - movq 8(%rsi),%r9 - movq %r8,%r11 - shrq $0,%r11 - andq %rax,%r11 - movq %r11,-128(%rdi) - movq %r8,%r10 - shrq $29,%r10 - andq %rax,%r10 - movq %r10,-120(%rdi) - shrdq $58,%r9,%r8 - andq %rax,%r8 - movq %r8,-112(%rdi) - movq 16(%rsi),%r10 - movq %r9,%r8 - shrq $23,%r8 - andq %rax,%r8 - movq %r8,-104(%rdi) - shrdq $52,%r10,%r9 - andq %rax,%r9 - movq %r9,-96(%rdi) - movq 24(%rsi),%r11 - movq %r10,%r9 - shrq $17,%r9 - andq %rax,%r9 - movq %r9,-88(%rdi) - shrdq $46,%r11,%r10 - andq %rax,%r10 - movq %r10,-80(%rdi) - movq 32(%rsi),%r8 - movq %r11,%r10 - shrq $11,%r10 - andq %rax,%r10 - movq %r10,-72(%rdi) - shrdq $40,%r8,%r11 - andq %rax,%r11 - movq %r11,-64(%rdi) - movq 40(%rsi),%r9 - movq %r8,%r11 - shrq $5,%r11 - andq %rax,%r11 - movq %r11,-56(%rdi) - movq %r8,%r10 - shrq $34,%r10 - andq %rax,%r10 - movq %r10,-48(%rdi) - shrdq $63,%r9,%r8 - andq %rax,%r8 - movq %r8,-40(%rdi) - movq 48(%rsi),%r10 - movq %r9,%r8 - shrq $28,%r8 - andq %rax,%r8 - movq %r8,-32(%rdi) - shrdq $57,%r10,%r9 - andq %rax,%r9 - movq %r9,-24(%rdi) - movq 56(%rsi),%r11 - movq %r10,%r9 - shrq $22,%r9 - andq %rax,%r9 - movq %r9,-16(%rdi) - shrdq $51,%r11,%r10 - andq %rax,%r10 - movq %r10,-8(%rdi) - movq 64(%rsi),%r8 - movq %r11,%r10 - shrq $16,%r10 - andq %rax,%r10 - movq %r10,0(%rdi) - shrdq $45,%r8,%r11 - andq %rax,%r11 - movq %r11,8(%rdi) - movq 72(%rsi),%r9 - movq %r8,%r11 - shrq $10,%r11 - andq %rax,%r11 - movq %r11,16(%rdi) - shrdq $39,%r9,%r8 - andq %rax,%r8 - movq %r8,24(%rdi) - movq 80(%rsi),%r10 - movq %r9,%r8 - shrq $4,%r8 - andq %rax,%r8 - movq %r8,32(%rdi) - movq %r9,%r11 - shrq $33,%r11 - andq %rax,%r11 - movq %r11,40(%rdi) - shrdq $62,%r10,%r9 - andq %rax,%r9 - movq %r9,48(%rdi) - movq 88(%rsi),%r11 - movq %r10,%r9 - shrq $27,%r9 - andq %rax,%r9 - movq %r9,56(%rdi) - shrdq $56,%r11,%r10 - andq %rax,%r10 - movq %r10,64(%rdi) - movq 96(%rsi),%r8 - movq %r11,%r10 - shrq $21,%r10 - andq %rax,%r10 - movq %r10,72(%rdi) - shrdq $50,%r8,%r11 - andq %rax,%r11 - movq %r11,80(%rdi) - movq 104(%rsi),%r9 - movq %r8,%r11 - shrq $15,%r11 - andq %rax,%r11 - movq %r11,88(%rdi) - shrdq $44,%r9,%r8 - andq %rax,%r8 - movq %r8,96(%rdi) - movq 
112(%rsi),%r10 - movq %r9,%r8 - shrq $9,%r8 - andq %rax,%r8 - movq %r8,104(%rdi) - shrdq $38,%r10,%r9 - andq %rax,%r9 - movq %r9,112(%rdi) - movq 120(%rsi),%r11 - movq %r10,%r9 - shrq $3,%r9 - andq %rax,%r9 - movq %r9,120(%rdi) - movq %r10,%r8 - shrq $32,%r8 - andq %rax,%r8 - movq %r8,128(%rdi) - shrdq $61,%r11,%r10 - andq %rax,%r10 - movq %r10,136(%rdi) - xorq %r8,%r8 - movq %r11,%r10 - shrq $26,%r10 - andq %rax,%r10 - movq %r10,144(%rdi) - shrdq $55,%r8,%r11 - andq %rax,%r11 - movq %r11,152(%rdi) - movq %r8,160(%rdi) - movq %r8,168(%rdi) - movq %r8,176(%rdi) - movq %r8,184(%rdi) - .byte 0xf3,0xc3 - +.globl _rsaz_1024_red2norm_avx2 .globl _rsaz_1024_scatter5_avx2 - -.p2align 5 -_rsaz_1024_scatter5_avx2: - vzeroupper - vmovdqu L$scatter_permd(%rip),%ymm5 - shll $4,%edx - leaq (%rdi,%rdx,1),%rdi - movl $9,%eax - jmp L$oop_scatter_1024 - -.p2align 5 -L$oop_scatter_1024: - vmovdqu (%rsi),%ymm0 - leaq 32(%rsi),%rsi - vpermd %ymm0,%ymm5,%ymm0 - vmovdqu %xmm0,(%rdi) - leaq 512(%rdi),%rdi - decl %eax - jnz L$oop_scatter_1024 - - vzeroupper - .byte 0xf3,0xc3 - - .globl _rsaz_1024_gather5_avx2 -.p2align 5 +_rsaz_1024_sqr_avx2: +_rsaz_1024_mul_avx2: +_rsaz_1024_norm2red_avx2: +_rsaz_1024_red2norm_avx2: +_rsaz_1024_scatter5_avx2: _rsaz_1024_gather5_avx2: - leaq L$gather_table(%rip),%r11 - movl %edx,%eax - andl $3,%edx - shrl $2,%eax - shll $4,%edx - - vmovdqu -32(%r11),%ymm7 - vpbroadcastb 8(%r11,%rax,1),%xmm8 - vpbroadcastb 7(%r11,%rax,1),%xmm9 - vpbroadcastb 6(%r11,%rax,1),%xmm10 - vpbroadcastb 5(%r11,%rax,1),%xmm11 - vpbroadcastb 4(%r11,%rax,1),%xmm12 - vpbroadcastb 3(%r11,%rax,1),%xmm13 - vpbroadcastb 2(%r11,%rax,1),%xmm14 - vpbroadcastb 1(%r11,%rax,1),%xmm15 - - leaq 64(%rsi,%rdx,1),%rsi - movq $64,%r11 - movl $9,%eax - jmp L$oop_gather_1024 - -.p2align 5 -L$oop_gather_1024: - vpand -64(%rsi),%xmm8,%xmm0 - vpand (%rsi),%xmm9,%xmm1 - vpand 64(%rsi),%xmm10,%xmm2 - vpand (%rsi,%r11,2),%xmm11,%xmm3 - vpor %xmm0,%xmm1,%xmm1 - vpand 64(%rsi,%r11,2),%xmm12,%xmm4 - vpor %xmm2,%xmm3,%xmm3 - vpand (%rsi,%r11,4),%xmm13,%xmm5 - vpor %xmm1,%xmm3,%xmm3 - vpand 64(%rsi,%r11,4),%xmm14,%xmm6 - vpor %xmm4,%xmm5,%xmm5 - vpand -128(%rsi,%r11,8),%xmm15,%xmm2 - leaq (%rsi,%r11,8),%rsi - vpor %xmm3,%xmm5,%xmm5 - vpor %xmm2,%xmm6,%xmm6 - vpor %xmm5,%xmm6,%xmm6 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqu %ymm6,(%rdi) - leaq 32(%rdi),%rdi - decl %eax - jnz L$oop_gather_1024 - - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - vzeroupper +.byte 0x0f,0x0b .byte 0xf3,0xc3 - - -.globl _rsaz_avx2_eligible - -.p2align 5 -_rsaz_avx2_eligible: - movl _OPENSSL_ia32cap_P+8(%rip),%eax - movl $524544,%ecx - movl $0,%edx - andl %eax,%ecx - cmpl $524544,%ecx - cmovel %edx,%eax - andl $32,%eax - shrl $5,%eax - .byte 0xf3,0xc3 - - -.p2align 6 -L$and_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 -L$scatter_permd: -.long 0,2,4,6,7,7,7,7 -L$gather_permd: -.long 0,7,1,7,2,7,3,7 -L$gather_table: -.byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 -.p2align 6 diff --git a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s index 23c540d3acb221..b92f098e73c214 100644 --- a/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/bn/rsaz-x86_64.s @@ -19,10 +19,6 @@ L$sqr_body: movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) - movl $524544,%r11d - andl _OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je L$oop_sqrx jmp L$oop_sqr .p2align 5 @@ -386,276 +382,6 @@ L$oop_sqr: decl %r8d jnz L$oop_sqr - jmp L$sqr_tail - -.p2align 5 -L$oop_sqrx: - movl %r8d,128+8(%rsp) -.byte 
102,72,15,110,199 -.byte 102,72,15,110,205 - - mulxq %rax,%r8,%r9 - - mulxq 16(%rsi),%rcx,%r10 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r11 - adcxq %rcx,%r9 - - mulxq 32(%rsi),%rcx,%r12 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rcx,%r11 - -.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r12 - adcxq %rcx,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adcxq %rbp,%r15 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shlq $1,%r8 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rdx,%r8 - movq 8(%rsi),%rdx - adcxq %rbp,%r9 - - movq %rax,(%rsp) - movq %r8,8(%rsp) - - - mulxq 16(%rsi),%rax,%rbx - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %r8,%r12 - - mulxq 32(%rsi),%rax,%rbx - adoxq %rax,%r12 - adcxq %rbx,%r13 - - mulxq 40(%rsi),%rdi,%r8 - adoxq %rdi,%r13 - adcxq %r8,%r14 - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - -.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 - adoxq %rdi,%r15 - adcxq %rbp,%r8 - adoxq %rbp,%r8 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rcx - movq 16(%rsi),%rdx - adcxq %rax,%r9 - adcxq %rcx,%r10 - adcxq %rbp,%r11 - - movq %r9,16(%rsp) -.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 - - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 - adoxq %rdi,%r12 - adcxq %r9,%r13 - - mulxq 32(%rsi),%rax,%rcx - adoxq %rax,%r13 - adcxq %rcx,%r14 - - mulxq 40(%rsi),%rdi,%r9 - adoxq %rdi,%r14 - adcxq %r9,%r15 - -.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 - adoxq %rax,%r15 - adcxq %rcx,%r8 - -.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %rbp,%r9 - adoxq %rbp,%r9 - - movq %r13,%rcx - shldq $1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq %rdx,%r12 - movq 24(%rsi),%rdx - adcxq %rbp,%r13 - - movq %r11,32(%rsp) -.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 - adoxq %rax,%r14 - adcxq %rbx,%r15 - - mulxq 40(%rsi),%rdi,%r10 - adoxq %rdi,%r15 - adcxq %r10,%r8 - - mulxq 48(%rsi),%rax,%rbx - adoxq %rax,%r8 - adcxq %rbx,%r9 - - mulxq 56(%rsi),%rdi,%r10 - adoxq %rdi,%r9 - adcxq %rbp,%r10 - adoxq %rbp,%r10 - -.byte 0x66 - movq %r15,%rbx - shldq $1,%r14,%r15 - shldq $1,%rcx,%r14 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r13 - adcxq %rdx,%r14 - movq 32(%rsi),%rdx - adcxq %rbp,%r15 - - movq %r13,48(%rsp) - movq %r14,56(%rsp) - - -.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 - adoxq %rdi,%r8 - adcxq %r11,%r9 - - mulxq 48(%rsi),%rax,%rcx - adoxq %rax,%r9 - adcxq %rcx,%r10 - - mulxq 56(%rsi),%rdi,%r11 - adoxq %rdi,%r10 - adcxq %rbp,%r11 - adoxq %rbp,%r11 - - movq %r9,%rcx - shldq $1,%r8,%r9 - shldq $1,%rbx,%r8 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r15 - adcxq %rdx,%r8 - movq 40(%rsi),%rdx - adcxq %rbp,%r9 - - movq %r15,64(%rsp) - movq %r8,72(%rsp) - - -.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 - adoxq %rax,%r10 - adcxq %rbx,%r11 - -.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 - adoxq %rdi,%r11 - adcxq %rbp,%r12 - adoxq %rbp,%r12 - - movq %r11,%rbx - shldq $1,%r10,%r11 - shldq $1,%rcx,%r10 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r9 - adcxq %rdx,%r10 - movq 48(%rsi),%rdx - adcxq %rbp,%r11 - - movq %r9,80(%rsp) - movq %r10,88(%rsp) - - -.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 - adoxq %rax,%r12 - adoxq %rbp,%r13 - - xorq %r14,%r14 - shldq $1,%r13,%r14 - shldq 
$1,%r12,%r13 - shldq $1,%rbx,%r12 - - xorl %ebp,%ebp - mulxq %rdx,%rax,%rdx - adcxq %rax,%r11 - adcxq %rdx,%r12 - movq 56(%rsi),%rdx - adcxq %rbp,%r13 - -.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 -.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 - - - mulxq %rdx,%rax,%rdx - adoxq %rax,%r13 - adoxq %rbp,%rdx - -.byte 0x66 - addq %rdx,%r14 - - movq %r13,112(%rsp) - movq %r14,120(%rsp) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,%rdx - movq %r9,%rax - movl 128+8(%rsp),%r8d - movq %rdi,%rsi - - decl %r8d - jnz L$oop_sqrx - -L$sqr_tail: leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -684,10 +410,6 @@ L$mul_body: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) - movl $524544,%r11d - andl _OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je L$mulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul @@ -705,29 +427,6 @@ L$mul_body: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp L$mul_tail - -.p2align 5 -L$mulx: - movq %rdx,%rbp - movq (%rdx),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex -L$mul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -762,52 +461,94 @@ _rsaz_512_mul_gather4: pushq %r14 pushq %r15 - movl %r9d,%r9d - subq $128+24,%rsp + subq $152,%rsp L$mul_gather4_body: - movl $524544,%r11d - andl _OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je L$mulx_gather - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - movl (%rdx,%r9,4),%ebx -.byte 102,72,15,110,201 + movd %r9d,%xmm8 + movdqa L$inc+16(%rip),%xmm1 + movdqa L$inc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 + + movdqa 0(%rdx),%xmm8 + movdqa 16(%rdx),%xmm9 + movdqa 32(%rdx),%xmm10 + movdqa 48(%rdx),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rdx),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rdx),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rdx),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rdx),%xmm15 + leaq 128(%rdx),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) - shlq $32,%rax - orq %rax,%rbx movq (%rsi),%rax movq 8(%rsi),%rcx - leaq 128(%rdx,%r9,4),%rbp mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r8 movq 16(%rsi),%rax movq 
%rdx,%r9 adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 @@ -820,14 +561,12 @@ L$mul_gather4_body: adcq $0,%r13 mulq %rbx - leaq 128(%rbp),%rbp addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 @@ -839,6 +578,35 @@ L$mul_gather4_body: .p2align 5 L$oop_mul_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax @@ -847,7 +615,6 @@ L$oop_mul_gather: adcq $0,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx @@ -856,7 +623,6 @@ L$oop_mul_gather: adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx @@ -865,7 +631,6 @@ L$oop_mul_gather: adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -874,7 +639,6 @@ L$oop_mul_gather: adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -899,7 +663,6 @@ L$oop_mul_gather: adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx @@ -907,7 +670,6 @@ L$oop_mul_gather: movq %rdx,%r15 adcq $0,%r15 - leaq 128(%rbp),%rbp leaq 8(%rdi),%rdi decl %ecx @@ -922,8 +684,8 @@ L$oop_mul_gather: movq %r14,48(%rdi) movq %r15,56(%rdi) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -935,126 +697,6 @@ L$oop_mul_gather: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp L$mul_gather_tail - -.p2align 5 -L$mulx_gather: - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - leaq 128(%rdx,%r9,4),%rbp - movl (%rdx,%r9,4),%edx -.byte 102,72,15,110,201 - movq %r8,128(%rsp) - - shlq $32,%rax - orq %rax,%rdx - mulxq (%rsi),%rbx,%r8 - movq %rbx,(%rsp) - xorl %edi,%edi - - mulxq 8(%rsi),%rax,%r9 - movd (%rbp),%xmm4 - - mulxq 16(%rsi),%rbx,%r10 - movd 64(%rbp),%xmm5 - adcxq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - pslldq $4,%xmm5 - adcxq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - por %xmm5,%xmm4 - adcxq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - leaq 128(%rbp),%rbp - adcxq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 -.byte 102,72,15,126,226 - adcxq %rbx,%r13 - adcxq %rax,%r14 - movq %r8,%rbx - adcxq %rdi,%r15 - - movq $-7,%rcx - jmp L$oop_mulx_gather - -.p2align 5 -L$oop_mulx_gather: - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 -.byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - movd 64(%rbp),%xmm5 - leaq 128(%rbp),%rbp - adcxq %rax,%r9 - adoxq %r11,%r10 - -.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 - pslldq $4,%xmm5 - por %xmm5,%xmm4 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq 
%rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 -.byte 102,72,15,126,226 - movq %rbx,64(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - movq %r8,%rbx - adcxq %rdi,%r15 - - incq %rcx - jnz L$oop_mulx_gather - - movq %r8,64(%rsp) - movq %r9,64+8(%rsp) - movq %r10,64+16(%rsp) - movq %r11,64+24(%rsp) - movq %r12,64+32(%rsp) - movq %r13,64+40(%rsp) - movq %r14,64+48(%rsp) - movq %r15,64+56(%rsp) - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -L$mul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1092,17 +734,13 @@ _rsaz_512_mul_scatter4: movl %r9d,%r9d subq $128+24,%rsp L$mul_scatter4_body: - leaq (%r8,%r9,4),%r8 + leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 movq %rcx,128(%rsp) movq %rdi,%rbp - movl $524544,%r11d - andl _OPENSSL_ia32cap_P+8(%rip),%r11d - cmpl $524544,%r11d - je L$mulx_scatter movq (%rdi),%rbx call __rsaz_512_mul @@ -1119,29 +757,6 @@ L$mul_scatter4_body: movq 56(%rsp),%r15 call __rsaz_512_reduce - jmp L$mul_scatter_tail - -.p2align 5 -L$mulx_scatter: - movq (%rdi),%rdx - call __rsaz_512_mulx - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq 128(%rsp),%rdx - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reducex - -L$mul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 @@ -1155,30 +770,14 @@ L$mul_scatter_tail: call __rsaz_512_subtract - movl %r8d,0(%rsi) - shrq $32,%r8 - movl %r9d,128(%rsi) - shrq $32,%r9 - movl %r10d,256(%rsi) - shrq $32,%r10 - movl %r11d,384(%rsi) - shrq $32,%r11 - movl %r12d,512(%rsi) - shrq $32,%r12 - movl %r13d,640(%rsi) - shrq $32,%r13 - movl %r14d,768(%rsi) - shrq $32,%r14 - movl %r15d,896(%rsi) - shrq $32,%r15 - movl %r8d,64(%rsi) - movl %r9d,192(%rsi) - movl %r10d,320(%rsi) - movl %r11d,448(%rsi) - movl %r12d,576(%rsi) - movl %r13d,704(%rsi) - movl %r14d,832(%rsi) - movl %r15d,960(%rsi) + movq %r8,0(%rsi) + movq %r9,128(%rsi) + movq %r10,256(%rsi) + movq %r11,384(%rsi) + movq %r12,512(%rsi) + movq %r13,640(%rsi) + movq %r14,768(%rsi) + movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -1204,7 +803,6 @@ _rsaz_512_mul_by_one: subq $128+24,%rsp L$mul_by_one_body: - movl _OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) @@ -1225,16 +823,7 @@ L$mul_by_one_body: movdqa %xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) - andl $524544,%eax - cmpl $524544,%eax - je L$by_one_callx call __rsaz_512_reduce - jmp L$by_one_tail -.p2align 5 -L$by_one_callx: - movq 128(%rsp),%rdx - call __rsaz_512_reducex -L$by_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) @@ -1339,62 +928,6 @@ L$reduction_loop: .byte 0xf3,0xc3 -.p2align 5 -__rsaz_512_reducex: - - imulq %r8,%rdx - xorq %rsi,%rsi - movl $8,%ecx - jmp L$reduction_loopx - -.p2align 5 -L$reduction_loopx: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - 
adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 128+8(%rsp),%rbx,%rdx - movq %rax,%rdx - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - - decl %ecx - jne L$reduction_loopx - - .byte 0xf3,0xc3 - - .p2align 5 __rsaz_512_subtract: movq %r8,(%rdi) @@ -1593,140 +1126,18 @@ L$oop_mul: .byte 0xf3,0xc3 - -.p2align 5 -__rsaz_512_mulx: - mulxq (%rsi),%rbx,%r8 - movq $-6,%rcx - - mulxq 8(%rsi),%rax,%r9 - movq %rbx,8(%rsp) - - mulxq 16(%rsi),%rbx,%r10 - adcq %rax,%r8 - - mulxq 24(%rsi),%rax,%r11 - adcq %rbx,%r9 - - mulxq 32(%rsi),%rbx,%r12 - adcq %rax,%r10 - - mulxq 40(%rsi),%rax,%r13 - adcq %rbx,%r11 - - mulxq 48(%rsi),%rbx,%r14 - adcq %rax,%r12 - - mulxq 56(%rsi),%rax,%r15 - movq 8(%rbp),%rdx - adcq %rbx,%r13 - adcq %rax,%r14 - adcq $0,%r15 - - xorq %rdi,%rdi - jmp L$oop_mulx - -.p2align 5 -L$oop_mulx: - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rsi),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rsi),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rsi),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rsi),%rax,%r15 - movq 64(%rbp,%rcx,8),%rdx - movq %rbx,8+64-8(%rsp,%rcx,8) - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - incq %rcx - jnz L$oop_mulx - - movq %r8,%rbx - mulxq (%rsi),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - -.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 - adcxq %rax,%r8 - adoxq %r10,%r9 - -.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rsi),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - - mulxq 32(%rsi),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rsi),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - -.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 - adcxq %rax,%r14 - adoxq %rdi,%r15 - adcxq %rdi,%r15 - - movq %rbx,8+64-8(%rsp) - movq %r8,8+64(%rsp) - movq %r9,8+64+8(%rsp) - movq %r10,8+64+16(%rsp) - movq %r11,8+64+24(%rsp) - movq %r12,8+64+32(%rsp) - movq %r13,8+64+40(%rsp) - movq %r14,8+64+48(%rsp) - movq %r15,8+64+56(%rsp) - - .byte 0xf3,0xc3 - .globl _rsaz_512_scatter4 .p2align 4 _rsaz_512_scatter4: - leaq (%rdi,%rdx,4),%rdi + leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp L$oop_scatter .p2align 4 L$oop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi - movl %eax,(%rdi) - shrq $32,%rax - movl %eax,64(%rdi) + movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz L$oop_scatter @@ -1737,18 +1148,72 @@ L$oop_scatter: .p2align 4 _rsaz_512_gather4: - leaq (%rsi,%rdx,4),%rsi + movd %edx,%xmm8 + movdqa L$inc+16(%rip),%xmm1 + movdqa L$inc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + 
pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp L$oop_gather .p2align 4 L$oop_gather: - movl (%rsi),%eax - movl 64(%rsi),%r8d + movdqa 0(%rsi),%xmm8 + movdqa 16(%rsi),%xmm9 + movdqa 32(%rsi),%xmm10 + movdqa 48(%rsi),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rsi),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rsi),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rsi),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi - shlq $32,%r8 - orq %r8,%rax - movq %rax,(%rdi) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz L$oop_gather .byte 0xf3,0xc3 +L$SEH_end_rsaz_512_gather4: + + +.p2align 6 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s index 040c324c49a753..c0f0b4bd6878b8 100644 --- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s +++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-gf2m.s @@ -242,7 +242,7 @@ L$body_mul_2x2: movq %rcx,56(%rsp) movq %r8,64(%rsp) - movq $15,%r8 + movq $0xf,%r8 movq %rsi,%rax movq %rcx,%rbp call _mul_1x1 diff --git a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s index 03b9c7d949de52..9b49555a4d2132 100644 --- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s +++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont.s @@ -10,7 +10,6 @@ _bn_mul_mont: jnz L$mul_enter cmpl $8,%r9d jb L$mul_enter - movl _OPENSSL_ia32cap_P+8(%rip),%r11d cmpq %rsi,%rdx jne L$mul4x_enter testl $7,%r9d @@ -216,9 +215,6 @@ L$mul_epilogue: .p2align 4 bn_mul4x_mont: L$mul4x_enter: - andl $524544,%r11d - cmpl $524544,%r11d - je L$mulx4x_enter pushq %rbx pushq %rbp pushq %r12 @@ -616,7 +612,6 @@ L$mul4x_epilogue: - .p2align 5 bn_sqr8x_mont: L$sqr8x_enter: @@ -638,20 +633,20 @@ L$sqr8x_enter: - leaq -64(%rsp,%r9,4),%r11 + leaq -64(%rsp,%r9,2),%r11 movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$sqr8x_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,4),%rsp + leaq -64(%rsp,%r9,2),%rsp jmp L$sqr8x_sp_done .p2align 5 L$sqr8x_sp_alt: - leaq 4096-64(,%r9,4),%r10 - leaq -64(%rsp,%r9,4),%rsp + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -661,385 +656,81 @@ L$sqr8x_sp_done: movq %r9,%r10 negq %r9 - leaq 64(%rsp,%r9,2),%r11 movq %r8,32(%rsp) movq %rax,40(%rsp) L$sqr8x_body: - movq %r9,%rbp -.byte 102,73,15,110,211 - shrq $3+2,%rbp - movl _OPENSSL_ia32cap_P+8(%rip),%eax - jmp L$sqr8x_copy_n - -.p2align 5 -L$sqr8x_copy_n: - movq 0(%rcx),%xmm0 - movq 8(%rcx),%xmm1 - movq 16(%rcx),%xmm3 - movq 24(%rcx),%xmm4 - leaq 32(%rcx),%rcx - movdqa %xmm0,0(%r11) - movdqa %xmm1,16(%r11) - movdqa %xmm3,32(%r11) - movdqa %xmm4,48(%r11) - leaq 64(%r11),%r11 - decq %rbp - jnz L$sqr8x_copy_n - +.byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 - andl $524544,%eax - cmpl $524544,%eax - jne L$sqr8x_nox - - call _bn_sqrx8x_internal - - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 - movq 40(%rsp),%rsi - jmp L$sqr8x_zero - -.p2align 5 -L$sqr8x_nox: call _bn_sqr8x_internal - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 - movq 40(%rsp),%rsi - jmp L$sqr8x_zero - -.p2align 5 -L$sqr8x_zero: - 
movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - movdqa %xmm0,32(%rax) - movdqa %xmm0,48(%rax) - leaq 64(%rax),%rax - movdqa %xmm0,0(%rdx) - movdqa %xmm0,16(%rdx) - movdqa %xmm0,32(%rdx) - movdqa %xmm0,48(%rdx) - leaq 64(%rdx),%rdx - decq %r9 - jnz L$sqr8x_zero - - movq $1,%rax - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -L$sqr8x_epilogue: - .byte 0xf3,0xc3 - - -.p2align 5 -bn_mulx4x_mont: -L$mulx4x_enter: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - shll $3,%r9d -.byte 0x67 - xorq %r10,%r10 - subq %r9,%r10 - movq (%r8),%r8 - leaq -72(%rsp,%r10,1),%rsp - leaq (%rdx,%r9,1),%r10 - andq $-128,%rsp - - - - - - - - - - - - - movq %r9,0(%rsp) - shrq $5,%r9 - movq %r10,16(%rsp) - subq $1,%r9 - movq %r8,24(%rsp) - movq %rdi,32(%rsp) - movq %rax,40(%rsp) - movq %r9,48(%rsp) - jmp L$mulx4x_body - -.p2align 5 -L$mulx4x_body: - leaq 8(%rdx),%rdi - movq (%rdx),%rdx - leaq 64+32(%rsp),%rbx - movq %rdx,%r9 - - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r14 - addq %rax,%r11 - movq %rdi,8(%rsp) - mulxq 16(%rsi),%r12,%r13 - adcq %r14,%r12 - adcq $0,%r13 - - movq %r8,%rdi - imulq 24(%rsp),%r8 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%rdi - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 -.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 - movq 48(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r12,-16(%rbx) - - jmp L$mulx4x_1st - -.p2align 5 -L$mulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - decq %rdi - jnz L$mulx4x_1st - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - addq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - jmp L$mulx4x_outer -.p2align 5 -L$mulx4x_outer: - movq (%rdi),%rdx - leaq 8(%rdi),%rdi - subq %rax,%rsi - movq %r15,(%rbx) - leaq 64+32(%rsp),%rbx - subq %rax,%rcx - - mulxq 0(%rsi),%r8,%r11 - xorl %ebp,%ebp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - adoxq %rbp,%r12 - adcxq %rbp,%r13 - - movq %rdi,8(%rsp) -.byte 0x67 - movq %r8,%r15 - imulq 24(%rsp),%r8 - xorl %ebp,%ebp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - adoxq -16(%rbx),%r12 - adcxq %rax,%r13 - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - adoxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 + leaq 
(%rdi,%r9,1),%rbx + movq %r9,%rcx movq %r9,%rdx - movq %r11,-24(%rbx) - leaq 32(%rcx),%rcx - adcxq %rax,%r12 - adoxq %rbp,%r15 - movq 48(%rsp),%rdi - movq %r12,-16(%rbx) - - jmp L$mulx4x_inner +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub .p2align 5 -L$mulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi +L$sqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-32(%rbx) - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz L$mulx4x_inner + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz L$sqr8x_sub - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - subq 0(%rbx),%rbp - adcq %r15,%r14 - movq -8(%rcx),%r8 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - - cmpq 16(%rsp),%rdi - jne L$mulx4x_outer - - subq %r14,%r8 - sbbq %r8,%r8 - orq %r8,%r15 - - negq %rax - xorq %rdx,%rdx - movq 32(%rsp),%rdi - leaq 64(%rsp),%rbx + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi +.byte 102,72,15,110,200 pxor %xmm0,%xmm0 - movq 0(%rcx,%rax,1),%r8 - movq 8(%rcx,%rax,1),%r9 - negq %r8 - jmp L$mulx4x_sub_entry + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + jmp L$sqr8x_cond_copy .p2align 5 -L$mulx4x_sub: - movq 0(%rcx,%rax,1),%r8 - movq 8(%rcx,%rax,1),%r9 - notq %r8 -L$mulx4x_sub_entry: - movq 16(%rcx,%rax,1),%r10 - notq %r9 - andq %r15,%r8 - movq 24(%rcx,%rax,1),%r11 - notq %r10 - andq %r15,%r9 - notq %r11 - andq %r15,%r10 - andq %r15,%r11 - - negq %rdx - adcq 0(%rbx),%r8 - adcq 8(%rbx),%r9 - movdqa %xmm0,(%rbx) - adcq 16(%rbx),%r10 - adcq 24(%rbx),%r11 - movdqa %xmm0,16(%rbx) +L$sqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 leaq 32(%rbx),%rbx - sbbq %rdx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz L$sqr8x_cond_copy - addq $32,%rax - jnz L$mulx4x_sub - - movq 40(%rsp),%rsi movq $1,%rax movq -48(%rsi),%r15 movq -40(%rsi),%r14 @@ -1048,7 +739,7 @@ L$mulx4x_sub_entry: movq -16(%rsi),%rbp movq -8(%rsi),%rbx leaq (%rsi),%rsp -L$mulx4x_epilogue: +L$sqr8x_epilogue: .byte 0xf3,0xc3 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git 
a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s index 99a16f85032d1e..c9731e162da51e 100644 --- a/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm/x64-macosx-gas/bn/x86_64-mont5.s @@ -8,53 +8,157 @@ _bn_mul_mont_gather5: testl $7,%r9d jnz L$mul_enter - movl _OPENSSL_ia32cap_P+8(%rip),%r11d jmp L$mul4x_enter .p2align 4 L$mul_enter: movl %r9d,%r9d movq %rsp,%rax - movl 8(%rsp),%r10d + movd 8(%rsp),%xmm5 + leaq L$inc(%rip),%r10 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 + leaq 2(%r9),%r11 negq %r11 - leaq (%rsp,%r11,8),%rsp + leaq -264(%rsp,%r11,8),%rsp andq $-1024,%rsp movq %rax,8(%rsp,%r9,8) L$mul_body: - movq %rdx,%r12 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq L$magic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 
272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 leaq 256(%r12),%r12 - por %xmm3,%xmm0 - .byte 102,72,15,126,195 movq (%r8),%r8 @@ -63,29 +167,14 @@ L$mul_body: xorq %r14,%r14 xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -118,14 +207,12 @@ L$1st_enter: cmpq %r9,%r15 jne L$1st -.byte 102,72,15,126,195 addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 @@ -139,33 +226,78 @@ L$1st_enter: jmp L$outer .p2align 4 L$outer: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax +.byte 102,72,15,126,195 + xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -201,15 +333,12 @@ L$inner_enter: cmpq %r9,%r15 jne L$inner -.byte 102,72,15,126,195 - addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 - movq (%rsp,%r15,8),%r10 + movq (%rsp,%r9,8),%r10 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx @@ -256,6 +385,7 @@ L$copy: movq 8(%rsp,%r9,8),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -270,9 +400,6 @@ L$mul_epilogue: .p2align 5 bn_mul4x_mont_gather5: L$mul4x_enter: - andl $524544,%r11d - cmpl $524544,%r11d - je L$mulx4x_enter .byte 0x67 movq %rsp,%rax pushq %rbx @@ -281,10 +408,10 @@ L$mul4x_enter: pushq %r13 pushq %r14 pushq %r15 + .byte 0x67 - movl %r9d,%r10d shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 @@ -294,19 +421,21 @@ L$mul4x_enter: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + + leaq 
-320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$mul4xsp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$mul4xsp_done .p2align 5 L$mul4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -322,6 +451,7 @@ L$mul4x_body: movq 40(%rsp),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -337,47 +467,141 @@ L$mul4x_epilogue: .p2align 5 mul4x_internal: shlq $5,%r9 - movl 8(%rax),%r10d - leaq 256(%rdx,%r9,1),%r13 + movd 8(%rax),%xmm5 + leaq L$inc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq L$magic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 - - movq -96(%r12),%xmm0 - leaq 256(%r12),%r14 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 -.byte 0x67 - por %xmm1,%xmm0 - movq -96(%r14),%xmm1 -.byte 0x67 - pand %xmm7,%xmm3 -.byte 0x67 - por %xmm2,%xmm0 - movq -32(%r14),%xmm2 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq 32(%r14),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por 
%xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 .byte 102,72,15,126,195 - movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) @@ -391,26 +615,10 @@ mul4x_internal: movq %rax,%r10 movq (%rcx),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq %r10,%rbp - - - - - - - - leaq 64+8(%rsp,%r11,8),%r14 + leaq 64+8(%rsp),%r14 movq %rdx,%r11 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - leaq 512(%r12),%r12 - por %xmm1,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax @@ -419,7 +627,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -429,7 +637,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -439,7 +647,7 @@ mul4x_internal: L$1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -455,7 +663,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -485,7 +693,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -494,7 +702,7 @@ L$1st4x: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -504,7 +712,7 @@ L$1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -520,7 +728,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -533,8 +741,7 @@ L$1st4x: movq %rdi,-16(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -545,6 +752,63 @@ L$1st4x: .p2align 5 L$outer4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx @@ -552,25 +816,11 @@ L$outer4x: movq (%rcx),%rax adcq $0,%rdx - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand 
%xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - imulq %r10,%rbp -.byte 0x67 movq %rdx,%r11 movq %rdi,(%r14) - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 leaq (%r14,%r9,1),%r14 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 @@ -580,7 +830,7 @@ L$outer4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -592,7 +842,7 @@ L$outer4x: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp L$inner4x @@ -601,7 +851,7 @@ L$outer4x: L$inner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -619,7 +869,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -653,7 +903,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -664,7 +914,7 @@ L$inner4x: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 @@ -674,7 +924,7 @@ L$inner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -693,7 +943,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 movq %rbp,%rax - movq -16(%rcx),%rbp + movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -708,9 +958,8 @@ L$inner4x: movq %r13,-24(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 movq %rdi,-16(%r14) - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -721,25 +970,28 @@ L$inner4x: cmpq 16+8(%rsp),%r12 jb L$outer4x + xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi - xorq $1,%rdi + subq %rdi,%rax leaq (%r14,%r9,1),%rbx - leaq (%rcx,%rdi,8),%rbp + movq (%rcx),%r12 + leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi - jmp L$sqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry .globl _bn_power5 .p2align 5 _bn_power5: - movl _OPENSSL_ia32cap_P+8(%rip),%r11d - andl $524544,%r11d - cmpl $524544,%r11d - je L$powerx5_enter movq %rsp,%rax pushq %rbx pushq %rbp @@ -747,9 +999,9 @@ _bn_power5: pushq %r13 pushq %r14 pushq %r15 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 @@ -759,19 +1011,20 @@ _bn_power5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$pwr_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$pwr_sp_done .p2align 5 L$pwr_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -799,10 +1052,15 @@ L$power5_body: .byte 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 @@ -1346,9 +1604,9 @@ L$sqr4x_shift_n_add: movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xorq %rax,%rax - leaq (%rbp,%r9,2),%rcx + leaq (%r9,%rbp,1),%rcx leaq 
48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi @@ -1381,14 +1639,14 @@ L$8x_reduction_loop: .p2align 5 L$8x_reduce: mulq %rbx - movq 16(%rbp),%rax + movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) @@ -1397,7 +1655,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi @@ -1406,7 +1664,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 @@ -1415,7 +1673,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1423,7 +1681,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1431,7 +1689,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1449,7 +1707,7 @@ L$8x_reduce: decl %ecx jnz L$8x_reduce - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp @@ -1475,14 +1733,14 @@ L$8x_reduce: L$8x_tail: mulq %rbx addq %rax,%r8 - movq 16(%rbp),%rax + movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi @@ -1491,7 +1749,7 @@ L$8x_tail: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 @@ -1499,7 +1757,7 @@ L$8x_tail: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 @@ -1507,7 +1765,7 @@ L$8x_tail: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1515,7 +1773,7 @@ L$8x_tail: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1523,7 +1781,7 @@ L$8x_tail: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1541,7 +1799,7 @@ L$8x_tail: decl %ecx jnz L$8x_tail - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae L$8x_tail_done @@ -1587,7 +1845,7 @@ L$8x_no_tail: adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax - movq -16(%rbp),%rcx + movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 @@ -1605,40 +1863,58 @@ L$8x_no_tail: cmpq %rdx,%rdi jb L$8x_reduction_loop + .byte 0xf3,0xc3 - subq %r15,%rcx + +.p2align 5 +__bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx - adcq %rsi,%rsi movq %r9,%rcx - orq %rsi,%rax .byte 102,72,15,126,207 - xorq $1,%rax + negq %rax .byte 102,72,15,126,206 - leaq (%rbp,%rax,8),%rbp sarq $3+2,%rcx - jmp L$sqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry -.p2align 5 +.p2align 4 L$sqr4x_sub: -.byte 0x66 - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rbx),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 32(%rbp),%r14 + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 
16(%rbx),%r14 + adcq 24(%rbx),%r15 movq %r12,0(%rdi) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp + leaq 32(%rbx),%rbx movq %r13,8(%rdi) + sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz L$sqr4x_sub + movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 @@ -1664,10 +1940,9 @@ bn_from_mont8x: pushq %r13 pushq %r14 pushq %r15 -.byte 0x67 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 @@ -1677,19 +1952,20 @@ bn_from_mont8x: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$from_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$from_sp_done .p2align 5 L$from_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -1740,22 +2016,8 @@ L$mul_by_1: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 - movl _OPENSSL_ia32cap_P+8(%rip),%r11d - andl $524544,%r11d - cmpl $524544,%r11d - jne L$from_mont_nox - - leaq (%rax,%r9,1),%rdi - call sqrx8x_reduction - - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - movq 40(%rsp),%rsi - jmp L$from_mont_zero - -.p2align 5 -L$from_mont_nox: - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 leaq 48(%rsp),%rax @@ -1783,1197 +2045,208 @@ L$from_mont_zero: L$from_epilogue: .byte 0xf3,0xc3 +.globl _bn_get_bits5 -.p2align 5 -bn_mulx4x_mont_gather5: -L$mulx4x_enter: -.byte 0x67 - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 -.byte 0x67 - movl %r9d,%r10d - shll $3,%r9d - shll $3+2,%r10d - negq %r9 - movq (%r8),%r8 - - - - - - - - - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb L$mulx4xsp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp - jmp L$mulx4xsp_done - -.p2align 5 -L$mulx4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rsp -L$mulx4xsp_done: - andq $-64,%rsp - - - - - - - - - - +.p2align 4 +_bn_get_bits5: + leaq 0(%rdi),%r10 + leaq 1(%rdi),%r11 + movl %esi,%ecx + shrl $4,%esi + andl $15,%ecx + leal -8(%rcx),%eax + cmpl $11,%ecx + cmovaq %r11,%r10 + cmoval %eax,%ecx + movzwl (%r10,%rsi,2),%eax + shrl %cl,%eax + andl $31,%eax + .byte 0xf3,0xc3 - movq %r8,32(%rsp) - movq %rax,40(%rsp) -L$mulx4x_body: - call mulx4x_internal +.globl _bn_scatter5 - movq 40(%rsp),%rsi - movq $1,%rax - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -L$mulx4x_epilogue: +.p2align 4 +_bn_scatter5: + cmpl $0,%esi + jz L$scatter_epilogue + leaq (%rdx,%rcx,8),%rdx +L$scatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz L$scatter +L$scatter_epilogue: .byte 0xf3,0xc3 +.globl _bn_gather5 .p2align 5 -mulx4x_internal: -.byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 -.byte 0x67 - negq %r9 - shlq $5,%r9 - leaq 256(%rdx,%r9,1),%r13 - shrq $5+5,%r9 - movl 8(%rax),%r10d - subq $1,%r9 - movq %r13,16+8(%rsp) - movq %r9,24+8(%rsp) - movq %rdi,56+8(%rsp) - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq L$magic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%rdi - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 - - movq -96(%rdi),%xmm0 - leaq 
256(%rdi),%rbx - movq -32(%rdi),%xmm1 - pand %xmm4,%xmm0 - movq 32(%rdi),%xmm2 - pand %xmm5,%xmm1 - movq 96(%rdi),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - movq -96(%rbx),%xmm1 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 - movq -32(%rbx),%xmm2 - por %xmm3,%xmm0 -.byte 0x67,0x67 - pand %xmm4,%xmm1 - movq 32(%rbx),%xmm3 - -.byte 102,72,15,126,194 - movq 96(%rbx),%xmm0 - leaq 512(%rdi),%rdi - pand %xmm5,%xmm2 -.byte 0x67,0x67 - pand %xmm6,%xmm3 - - - - - - - - leaq 64+32+8(%rsp,%r11,8),%rbx - - movq %rdx,%r9 - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r12 - addq %rax,%r11 - mulxq 16(%rsi),%rax,%r13 - adcq %rax,%r12 - adcq $0,%r13 - mulxq 24(%rsi),%rax,%r14 +_bn_gather5: +L$SEH_begin_bn_gather5: + +.byte 0x4c,0x8d,0x14,0x24 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq L$inc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp L$gather - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - xorq %rbp,%rbp - movq %r8,%rdx +.p2align 5 +L$gather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + 
pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz L$gather - por %xmm2,%xmm1 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - movq %rdi,8+8(%rsp) - por %xmm1,%xmm0 - -.byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 16(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 32(%rcx),%rax,%r12 - movq 24+8(%rsp),%rdi -.byte 0x66 - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 48(%rcx),%rax,%r15 -.byte 0x67,0x67 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 -.byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 - movq %r12,-16(%rbx) - - -.p2align 5 -L$mulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 16(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 32(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 48(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 64(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz L$mulx4x_1st - - movq 8(%rsp),%rax -.byte 102,72,15,126,194 - adcq %rbp,%r15 - leaq (%rsi,%rax,1),%rsi - addq %r15,%r14 - movq 8+8(%rsp),%rdi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - jmp L$mulx4x_outer - -.p2align 5 -L$mulx4x_outer: - movq %rbp,(%rbx) - leaq 32(%rbx,%rax,1),%rbx - mulxq 0(%rsi),%r8,%r11 - xorq %rbp,%rbp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - mulxq 24(%rsi),%rdx,%r14 - adoxq -16(%rbx),%r12 - adcxq %rdx,%r13 - leaq (%rcx,%rax,2),%rcx - leaq 32(%rsi),%rsi - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - adoxq %rbp,%r14 - -.byte 0x67 - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - - movq -96(%rdi),%xmm0 -.byte 0x67,0x67 - movq %r8,%rdx - movq -32(%rdi),%xmm1 -.byte 0x67 - pand %xmm4,%xmm0 - movq 32(%rdi),%xmm2 -.byte 0x67 - pand %xmm5,%xmm1 - movq 96(%rdi),%xmm3 - addq $256,%rdi -.byte 0x67 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - xorq %rbp,%rbp - movq %rdi,8+8(%rsp) - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 16(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 32(%rcx),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 48(%rcx),%rax,%r15 - movq %r9,%rdx - por %xmm2,%xmm0 - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - por %xmm3,%xmm0 - adcxq %rax,%r12 - movq %r11,-24(%rbx) - adoxq %rbp,%r15 - movq %r12,-16(%rbx) - leaq 64(%rcx),%rcx - jmp L$mulx4x_inner - -.p2align 5 -L$mulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 16(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - 
mulxq 32(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - movq %r11,-32(%rbx) - mulxq 48(%rcx),%rax,%r15 - movq %r9,%rdx - leaq 64(%rcx),%rcx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - movq %r13,-16(%rbx) - - decq %rdi - jnz L$mulx4x_inner - - movq 0+8(%rsp),%rax -.byte 102,72,15,126,194 - adcq %rbp,%r15 - subq 0(%rbx),%rdi - movq 8+8(%rsp),%rdi - movq 16+8(%rsp),%r10 - adcq %r15,%r14 - leaq (%rsi,%rax,1),%rsi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - - cmpq %r10,%rdi - jb L$mulx4x_outer - - movq -16(%rcx),%r10 - xorq %r15,%r15 - subq %r14,%r10 - adcq %r15,%r15 - orq %r15,%rbp - xorq $1,%rbp - leaq (%rbx,%rax,1),%rdi - leaq (%rcx,%rax,2),%rcx -.byte 0x67,0x67 - sarq $3+2,%rax - leaq (%rcx,%rbp,8),%rbp - movq 56+8(%rsp),%rdx - movq %rax,%rcx - jmp L$sqrx4x_sub - - -.p2align 5 -bn_powerx5: -L$powerx5_enter: -.byte 0x67 - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 -.byte 0x67 - movl %r9d,%r10d - shll $3,%r9d - shll $3+2,%r10d - negq %r9 - movq (%r8),%r8 - - - - - - - - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb L$pwrx_sp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp - jmp L$pwrx_sp_done - -.p2align 5 -L$pwrx_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rsp -L$pwrx_sp_done: - andq $-64,%rsp - movq %r9,%r10 - negq %r9 - - - - - - - - - - - - - pxor %xmm0,%xmm0 -.byte 102,72,15,110,207 -.byte 102,72,15,110,209 -.byte 102,73,15,110,218 -.byte 102,72,15,110,226 - movq %r8,32(%rsp) - movq %rax,40(%rsp) -L$powerx5_body: - - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - call __bn_sqrx8x_internal - - movq %r10,%r9 - movq %rsi,%rdi -.byte 102,72,15,126,209 -.byte 102,72,15,126,226 - movq 40(%rsp),%rax - - call mulx4x_internal - - movq 40(%rsp),%rsi - movq $1,%rax - movq -48(%rsi),%r15 - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -L$powerx5_epilogue: - .byte 0xf3,0xc3 - - -.globl _bn_sqrx8x_internal -.private_extern _bn_sqrx8x_internal - -.p2align 5 -_bn_sqrx8x_internal: -__bn_sqrx8x_internal: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - leaq 48+8(%rsp),%rdi - leaq (%rsi,%r9,1),%rbp - movq %r9,0+8(%rsp) - movq %rbp,8+8(%rsp) - jmp L$sqr8x_zero_start - -.p2align 5 -.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 -L$sqrx8x_zero: -.byte 0x3e - movdqa %xmm0,0(%rdi) - movdqa %xmm0,16(%rdi) - movdqa %xmm0,32(%rdi) - movdqa %xmm0,48(%rdi) -L$sqr8x_zero_start: - movdqa %xmm0,64(%rdi) - movdqa %xmm0,80(%rdi) - movdqa %xmm0,96(%rdi) - movdqa %xmm0,112(%rdi) - leaq 128(%rdi),%rdi - subq $64,%r9 - jnz L$sqrx8x_zero - - movq 0(%rsi),%rdx - - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - leaq 48+8(%rsp),%rdi - xorq %rbp,%rbp - jmp L$sqrx8x_outer_loop - -.p2align 5 -L$sqrx8x_outer_loop: - mulxq 8(%rsi),%r8,%rax - adcxq %r9,%r8 - adoxq %rax,%r10 - mulxq 16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 -.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 - adcxq %r11,%r10 - adoxq %rax,%r12 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 - adcxq %r12,%r11 - adoxq %rax,%r13 - mulxq 40(%rsi),%r12,%rax - adcxq %r13,%r12 - adoxq %rax,%r14 - mulxq 48(%rsi),%r13,%rax - adcxq %r14,%r13 - adoxq %r15,%rax - mulxq 56(%rsi),%r14,%r15 - movq 8(%rsi),%rdx - 
adcxq %rax,%r14 - adoxq %rbp,%r15 - adcq 64(%rdi),%r15 - movq %r8,8(%rdi) - movq %r9,16(%rdi) - sbbq %rcx,%rcx - xorq %rbp,%rbp - - - mulxq 16(%rsi),%r8,%rbx - mulxq 24(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 32(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %rbx,%r11 -.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 - adcxq %r13,%r11 - adoxq %r14,%r12 -.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 - movq 16(%rsi),%rdx - adcxq %rax,%r12 - adoxq %rbx,%r13 - adcxq %r15,%r13 - adoxq %rbp,%r14 - adcxq %rbp,%r14 - - movq %r8,24(%rdi) - movq %r9,32(%rdi) - - mulxq 24(%rsi),%r8,%rbx - mulxq 32(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 40(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %r13,%r11 -.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 -.byte 0x3e - movq 24(%rsi),%rdx - adcxq %rbx,%r11 - adoxq %rax,%r12 - adcxq %r14,%r12 - movq %r8,40(%rdi) - movq %r9,48(%rdi) - mulxq 32(%rsi),%r8,%rax - adoxq %rbp,%r13 - adcxq %rbp,%r13 - - mulxq 40(%rsi),%r9,%rbx - adcxq %r10,%r8 - adoxq %rax,%r9 - mulxq 48(%rsi),%r10,%rax - adcxq %r11,%r9 - adoxq %r12,%r10 - mulxq 56(%rsi),%r11,%r12 - movq 32(%rsi),%rdx - movq 40(%rsi),%r14 - adcxq %rbx,%r10 - adoxq %rax,%r11 - movq 48(%rsi),%r15 - adcxq %r13,%r11 - adoxq %rbp,%r12 - adcxq %rbp,%r12 - - movq %r8,56(%rdi) - movq %r9,64(%rdi) - - mulxq %r14,%r9,%rax - movq 56(%rsi),%r8 - adcxq %r10,%r9 - mulxq %r15,%r10,%rbx - adoxq %rax,%r10 - adcxq %r11,%r10 - mulxq %r8,%r11,%rax - movq %r14,%rdx - adoxq %rbx,%r11 - adcxq %r12,%r11 - - adcxq %rbp,%rax - - mulxq %r15,%r14,%rbx - mulxq %r8,%r12,%r13 - movq %r15,%rdx - leaq 64(%rsi),%rsi - adcxq %r14,%r11 - adoxq %rbx,%r12 - adcxq %rax,%r12 - adoxq %rbp,%r13 - -.byte 0x67,0x67 - mulxq %r8,%r8,%r14 - adcxq %r8,%r13 - adcxq %rbp,%r14 - - cmpq 8+8(%rsp),%rsi - je L$sqrx8x_outer_break - - negq %rcx - movq $-8,%rcx - movq %rbp,%r15 - movq 64(%rdi),%r8 - adcxq 72(%rdi),%r9 - adcxq 80(%rdi),%r10 - adcxq 88(%rdi),%r11 - adcq 96(%rdi),%r12 - adcq 104(%rdi),%r13 - adcq 112(%rdi),%r14 - adcq 120(%rdi),%r15 - leaq (%rsi),%rbp - leaq 128(%rdi),%rdi - sbbq %rax,%rax - - movq -64(%rsi),%rdx - movq %rax,16+8(%rsp) - movq %rdi,24+8(%rsp) - - - xorl %eax,%eax - jmp L$sqrx8x_loop - -.p2align 5 -L$sqrx8x_loop: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - movq %rbx,(%rdi,%rcx,8) - movl $0,%ebx - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 - movq 8(%rsi,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rbx,%r15 - adcxq %rbx,%r15 - -.byte 0x67 - incq %rcx - jnz L$sqrx8x_loop - - leaq 64(%rbp),%rbp - movq $-8,%rcx - cmpq 8+8(%rsp),%rbp - je L$sqrx8x_break - - subq 16+8(%rsp),%rbx -.byte 0x66 - movq -64(%rsi),%rdx - adcxq 0(%rdi),%r8 - adcxq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi -.byte 0x67 - sbbq %rax,%rax - xorl %ebx,%ebx - movq %rax,16+8(%rsp) - jmp L$sqrx8x_loop - -.p2align 5 -L$sqrx8x_break: 
- subq 16+8(%rsp),%r8 - movq 24+8(%rsp),%rcx - movq 0(%rsi),%rdx - xorl %ebp,%ebp - movq %r8,0(%rdi) - cmpq %rcx,%rdi - je L$sqrx8x_outer_loop - - movq %r9,8(%rdi) - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - movq 40(%rcx),%r13 - movq %r14,48(%rdi) - movq 48(%rcx),%r14 - movq %r15,56(%rdi) - movq 56(%rcx),%r15 - movq %rcx,%rdi - jmp L$sqrx8x_outer_loop - -.p2align 5 -L$sqrx8x_outer_break: - movq %r9,72(%rdi) -.byte 102,72,15,126,217 - movq %r10,80(%rdi) - movq %r11,88(%rdi) - movq %r12,96(%rdi) - movq %r13,104(%rdi) - movq %r14,112(%rdi) - leaq 48+8(%rsp),%rdi - movq (%rsi,%rcx,1),%rdx - - movq 8(%rdi),%r11 - xorq %r10,%r10 - movq 0+8(%rsp),%r9 - adoxq %r11,%r11 - movq 16(%rdi),%r12 - movq 24(%rdi),%r13 - - -.p2align 5 -L$sqrx4x_shift_n_add: - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax -.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 -.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 40(%rdi),%r11 - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - movq 16(%rsi,%rcx,1),%rdx - movq 48(%rdi),%r12 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 56(%rdi),%r13 - movq %rax,16(%rdi) - movq %rbx,24(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax - movq 24(%rsi,%rcx,1),%rdx - leaq 32(%rcx),%rcx - movq 64(%rdi),%r10 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 72(%rdi),%r11 - movq %rax,32(%rdi) - movq %rbx,40(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - jrcxz L$sqrx4x_shift_n_add_break -.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 80(%rdi),%r12 - movq 88(%rdi),%r13 - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi - nop - jmp L$sqrx4x_shift_n_add - -.p2align 5 -L$sqrx4x_shift_n_add_break: - adcxq %r13,%rbx - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi -.byte 102,72,15,126,213 -sqrx8x_reduction: - xorl %eax,%eax - movq 32+8(%rsp),%rbx - movq 48+8(%rsp),%rdx - leaq -128(%rbp,%r9,2),%rcx - - movq %rcx,0+8(%rsp) - movq %rdi,8+8(%rsp) - - leaq 48+8(%rsp),%rdi - jmp L$sqrx8x_reduction_loop - -.p2align 5 -L$sqrx8x_reduction_loop: - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq %rdx,%r8 - imulq %rbx,%rdx - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq %rax,24+8(%rsp) - - leaq 64(%rdi),%rdi - xorq %rsi,%rsi - movq $-8,%rcx - jmp L$sqrx8x_reduce - -.p2align 5 -L$sqrx8x_reduce: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 16(%rbp),%rbx,%r9 - adcxq %rbx,%r8 - adoxq %r10,%r9 - - mulxq 32(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 48(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 32+8(%rsp),%rbx,%rdx - movq %rax,%rdx - movq %rax,64+48+8(%rsp,%rcx,8) - - mulxq 80(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 96(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 112(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - -.byte 0x67,0x67,0x67 - incq %rcx - jnz L$sqrx8x_reduce - - movq %rsi,%rax - cmpq 0+8(%rsp),%rbp - jae L$sqrx8x_no_tail - - movq 48+8(%rsp),%rdx - addq 0(%rdi),%r8 - leaq 128(%rbp),%rbp - movq $-8,%rcx - adcxq 8(%rdi),%r9 - adcxq 16(%rdi),%r10 - adcq 
24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp L$sqrx8x_tail - -.p2align 5 -L$sqrx8x_tail: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 16(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 32(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 48(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 80(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 96(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 112(%rbp),%rax,%r15 - movq 72+48+8(%rsp,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - movq %rbx,(%rdi,%rcx,8) - movq %r8,%rbx - adcxq %rsi,%r15 - - incq %rcx - jnz L$sqrx8x_tail - - cmpq 0+8(%rsp),%rbp - jae L$sqrx8x_tail_done - - subq 16+8(%rsp),%rsi - movq 48+8(%rsp),%rdx - leaq 128(%rbp),%rbp - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - subq $8,%rcx - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp L$sqrx8x_tail - -.p2align 5 -L$sqrx8x_tail_done: - addq 24+8(%rsp),%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - - movq %rsi,%rax - - subq 16+8(%rsp),%rsi -L$sqrx8x_no_tail: - adcq 0(%rdi),%r8 -.byte 102,72,15,126,217 - adcq 8(%rdi),%r9 - movq 112(%rbp),%rsi -.byte 102,72,15,126,213 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - adcq %rax,%rax - - movq 32+8(%rsp),%rbx - movq 64(%rdi,%rcx,1),%rdx - - movq %r8,0(%rdi) - leaq 64(%rdi),%r8 - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 64(%rdi,%rcx,1),%rdi - cmpq 8+8(%rsp),%r8 - jb L$sqrx8x_reduction_loop - xorl %ebx,%ebx - subq %r15,%rsi - adcq %rbx,%rbx - movq %rcx,%r10 - orq %rbx,%rax - movq %rcx,%r9 - xorq $1,%rax - sarq $3+2,%rcx - - leaq (%rbp,%rax,8),%rbp -.byte 102,72,15,126,202 -.byte 102,72,15,126,206 - jmp L$sqrx4x_sub - -.p2align 5 -L$sqrx4x_sub: -.byte 0x66 - movq 0(%rdi),%r12 - movq 8(%rdi),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rdi),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rdi),%r15 - leaq 32(%rdi),%rdi - sbbq 32(%rbp),%r14 - movq %r12,0(%rdx) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp - movq %r13,8(%rdx) - movq %r14,16(%rdx) - movq %r15,24(%rdx) - leaq 32(%rdx),%rdx - - incq %rcx - jnz L$sqrx4x_sub - negq %r9 - - .byte 0xf3,0xc3 - -.globl _bn_get_bits5 - -.p2align 4 -_bn_get_bits5: - leaq 0(%rdi),%r10 - leaq 1(%rdi),%r11 - movl %esi,%ecx - shrl $4,%esi - andl $15,%ecx - leal -8(%rcx),%eax - cmpl $11,%ecx - cmovaq %r11,%r10 - cmoval %eax,%ecx - movzwl (%r10,%rsi,2),%eax - shrl %cl,%eax - andl $31,%eax - .byte 0xf3,0xc3 - - -.globl _bn_scatter5 - -.p2align 4 -_bn_scatter5: - cmpl $0,%esi - jz L$scatter_epilogue - leaq (%rdx,%rcx,8),%rdx -L$scatter: - movq (%rdi),%rax - leaq 8(%rdi),%rdi - movq %rax,(%rdx) - leaq 256(%rdx),%rdx - subl $1,%esi - jnz L$scatter -L$scatter_epilogue: - .byte 0xf3,0xc3 - - -.globl _bn_gather5 - -.p2align 4 -_bn_gather5: - movl %ecx,%r11d - shrl $3,%ecx - andq $7,%r11 - notl %ecx - leaq L$magic_masks(%rip),%rax - andl $3,%ecx - leaq 128(%rdx,%r11,8),%rdx - movq 0(%rax,%rcx,8),%xmm4 - movq 
8(%rax,%rcx,8),%xmm5 - movq 16(%rax,%rcx,8),%xmm6 - movq 24(%rax,%rcx,8),%xmm7 - jmp L$gather -.p2align 4 -L$gather: - movq -128(%rdx),%xmm0 - movq -64(%rdx),%xmm1 - pand %xmm4,%xmm0 - movq 0(%rdx),%xmm2 - pand %xmm5,%xmm1 - movq 64(%rdx),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 -.byte 0x67,0x67 - por %xmm2,%xmm0 - leaq 256(%rdx),%rdx - por %xmm3,%xmm0 - - movq %xmm0,(%rdi) - leaq 8(%rdi),%rdi - subl $1,%esi - jnz L$gather + leaq (%r10),%rsp .byte 0xf3,0xc3 L$SEH_end_bn_gather5: .p2align 6 -L$magic_masks: -.long 0,0, 0,0, 0,0, -1,-1 -.long 0,0, 0,0, 0,0, 0,0 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s index f2eb8554e87067..30456b900fdffc 100644 --- a/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/ec/ecp_nistz256-x86_64.s @@ -332,8 +332,6 @@ _ecp_nistz256_neg: .p2align 5 _ecp_nistz256_to_mont: - movl $524544,%ecx - andl _OPENSSL_ia32cap_P+8(%rip),%ecx leaq L$RR(%rip),%rdx jmp L$mul_mont @@ -348,8 +346,6 @@ _ecp_nistz256_to_mont: .p2align 5 _ecp_nistz256_mul_mont: - movl $524544,%ecx - andl _OPENSSL_ia32cap_P+8(%rip),%ecx L$mul_mont: pushq %rbp pushq %rbx @@ -357,8 +353,6 @@ L$mul_mont: pushq %r13 pushq %r14 pushq %r15 - cmpl $524544,%ecx - je L$mul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 @@ -367,19 +361,6 @@ L$mul_mont: movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq - jmp L$mul_mont_done - -.p2align 5 -L$mul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx L$mul_mont_done: popq %r15 popq %r14 @@ -617,33 +598,18 @@ __ecp_nistz256_mul_montq: .p2align 5 _ecp_nistz256_sqr_mont: - movl $524544,%ecx - andl _OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 - cmpl $524544,%ecx - je L$sqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq - jmp L$sqr_mont_done - -.p2align 5 -L$sqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_sqr_montx L$sqr_mont_done: popq %r15 popq %r14 @@ -816,304 +782,6 @@ __ecp_nistz256_sqr_montq: .byte 0xf3,0xc3 -.p2align 5 -__ecp_nistz256_mul_montx: - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - movq $32,%r14 - xorq %r13,%r13 - mulxq %r11,%rbp,%r11 - movq L$poly+24(%rip),%r15 - adcq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - adcq %rbp,%r10 - shlxq %r14,%r8,%rbp - adcq %rcx,%r11 - shrxq %r14,%r8,%rcx - adcq $0,%r12 - - - - addq %rbp,%r9 - adcq %rcx,%r10 - - mulxq %r15,%rcx,%rbp - movq 8(%rbx),%rdx - adcq %rcx,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - xorq %r8,%r8 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - adcxq %rcx,%r12 - shlxq %r14,%r9,%rcx - adoxq %rbp,%r13 - shrxq %r14,%r9,%rbp - - adcxq %r8,%r13 - 
adoxq %r8,%r8 - adcq $0,%r8 - - - - addq %rcx,%r10 - adcq %rbp,%r11 - - mulxq %r15,%rcx,%rbp - movq 16(%rbx),%rdx - adcq %rcx,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - adcxq %rcx,%r13 - shlxq %r14,%r10,%rcx - adoxq %rbp,%r8 - shrxq %r14,%r10,%rbp - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - - addq %rcx,%r11 - adcq %rbp,%r12 - - mulxq %r15,%rcx,%rbp - movq 24(%rbx),%rdx - adcq %rcx,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - adcxq %rcx,%r8 - shlxq %r14,%r11,%rcx - adoxq %rbp,%r9 - shrxq %r14,%r11,%rbp - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - - addq %rcx,%r12 - adcq %rbp,%r13 - - mulxq %r15,%rcx,%rbp - movq %r12,%rbx - movq L$poly+8(%rip),%r14 - adcq %rcx,%r8 - movq %r13,%rdx - adcq %rbp,%r9 - adcq $0,%r10 - - - - xorl %eax,%eax - movq %r8,%rcx - sbbq $-1,%r12 - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rbp - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %rbp,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - -.p2align 5 -__ecp_nistz256_sqr_montx: - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - xorl %eax,%eax - adcq %rcx,%r10 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - - mulxq %r8,%rcx,%r14 - movq 0+128(%rsi),%rdx - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - mulxq %rdx,%r8,%rbp - movq 8+128(%rsi),%rdx - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax - movq 16+128(%rsi),%rdx - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 -.byte 0x67 - mulxq %rdx,%rcx,%rbp - movq 24+128(%rsi),%rdx - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - movq $32,%rsi - adoxq %rbp,%r13 -.byte 0x67,0x67 - mulxq %rdx,%rcx,%rax - movq %r8,%rdx - adoxq %rcx,%r14 - shlxq %rsi,%r8,%rcx - adoxq %rax,%r15 - shrxq %rsi,%r8,%rax - movq L$poly+24(%rip),%rbp - - - addq %rcx,%r9 - adcq %rax,%r10 - - mulxq %rbp,%rcx,%r8 - movq %r9,%rdx - adcq %rcx,%r11 - shlxq %rsi,%r9,%rcx - adcq $0,%r8 - shrxq %rsi,%r9,%rax - - - addq %rcx,%r10 - adcq %rax,%r11 - - mulxq %rbp,%rcx,%r9 - movq %r10,%rdx - adcq %rcx,%r8 - shlxq %rsi,%r10,%rcx - adcq $0,%r9 - shrxq %rsi,%r10,%rax - - - addq %rcx,%r11 - adcq %rax,%r8 - - mulxq %rbp,%rcx,%r10 - movq %r11,%rdx - adcq %rcx,%r9 - shlxq %rsi,%r11,%rcx - adcq $0,%r10 - shrxq %rsi,%r11,%rax - - - addq %rcx,%r8 - adcq %rax,%r9 - - mulxq %rbp,%rcx,%r11 - adcq %rcx,%r10 - adcq $0,%r11 - - xorq %rdx,%rdx - adcq %r8,%r12 - movq L$poly+8(%rip),%rsi - adcq %r9,%r13 - movq %r12,%r8 - adcq %r10,%r14 - adcq %r11,%r15 - movq %r13,%r9 - adcq $0,%rdx - - xorl %eax,%eax - sbbq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%r11 - sbbq %rbp,%r15 - sbbq $0,%rdx - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - 
cmovcq %r11,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 - - @@ -1215,9 +883,6 @@ _ecp_nistz256_from_mont: .p2align 5 _ecp_nistz256_select_w5: - movl _OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz L$avx2_select_w5 movdqa L$One(%rip),%xmm0 movd %edx,%xmm1 @@ -1277,9 +942,6 @@ L$select_loop_sse_w5: .p2align 5 _ecp_nistz256_select_w7: - movl _OPENSSL_ia32cap_P+8(%rip),%eax - testl $32,%eax - jnz L$avx2_select_w7 movdqa L$One(%rip),%xmm8 movd %edx,%xmm1 @@ -1321,141 +983,11 @@ L$select_loop_sse_w7: movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 - - - -.p2align 5 -ecp_nistz256_avx2_select_w5: -L$avx2_select_w5: - vzeroupper - vmovdqa L$Two(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - vpxor %ymm4,%ymm4,%ymm4 - - vmovdqa L$One(%rip),%ymm5 - vmovdqa L$Two(%rip),%ymm10 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - movq $8,%rax -L$select_loop_avx2_w5: - - vmovdqa 0(%rsi),%ymm6 - vmovdqa 32(%rsi),%ymm7 - vmovdqa 64(%rsi),%ymm8 - - vmovdqa 96(%rsi),%ymm11 - vmovdqa 128(%rsi),%ymm12 - vmovdqa 160(%rsi),%ymm13 - - vpcmpeqd %ymm1,%ymm5,%ymm9 - vpcmpeqd %ymm1,%ymm10,%ymm14 - - vpaddd %ymm0,%ymm5,%ymm5 - vpaddd %ymm0,%ymm10,%ymm10 - leaq 192(%rsi),%rsi - - vpand %ymm9,%ymm6,%ymm6 - vpand %ymm9,%ymm7,%ymm7 - vpand %ymm9,%ymm8,%ymm8 - vpand %ymm14,%ymm11,%ymm11 - vpand %ymm14,%ymm12,%ymm12 - vpand %ymm14,%ymm13,%ymm13 - - vpxor %ymm6,%ymm2,%ymm2 - vpxor %ymm7,%ymm3,%ymm3 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm11,%ymm2,%ymm2 - vpxor %ymm12,%ymm3,%ymm3 - vpxor %ymm13,%ymm4,%ymm4 - - decq %rax - jnz L$select_loop_avx2_w5 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vmovdqu %ymm4,64(%rdi) - vzeroupper - .byte 0xf3,0xc3 - - - - .globl _ecp_nistz256_avx2_select_w7 .p2align 5 _ecp_nistz256_avx2_select_w7: -L$avx2_select_w7: - vzeroupper - vmovdqa L$Three(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - - vmovdqa L$One(%rip),%ymm4 - vmovdqa L$Two(%rip),%ymm8 - vmovdqa L$Three(%rip),%ymm12 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - - movq $21,%rax -L$select_loop_avx2_w7: - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vmovdqa 64(%rsi),%ymm9 - vmovdqa 96(%rsi),%ymm10 - - vmovdqa 128(%rsi),%ymm13 - vmovdqa 160(%rsi),%ymm14 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - vpcmpeqd %ymm1,%ymm8,%ymm11 - vpcmpeqd %ymm1,%ymm12,%ymm15 - - vpaddd %ymm0,%ymm4,%ymm4 - vpaddd %ymm0,%ymm8,%ymm8 - vpaddd %ymm0,%ymm12,%ymm12 - leaq 192(%rsi),%rsi - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - vpand %ymm11,%ymm9,%ymm9 - vpand %ymm11,%ymm10,%ymm10 - vpand %ymm15,%ymm13,%ymm13 - vpand %ymm15,%ymm14,%ymm14 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm13,%ymm2,%ymm2 - vpxor %ymm14,%ymm3,%ymm3 - - decq %rax - jnz L$select_loop_avx2_w7 - - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vzeroupper +.byte 0x0f,0x0b .byte 0xf3,0xc3 @@ -1581,10 +1113,6 @@ __ecp_nistz256_mul_by_2q: .p2align 5 _ecp_nistz256_point_double: - movl $524544,%ecx - andl _OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $524544,%ecx - je L$point_doublex pushq %rbp pushq %rbx pushq %r12 @@ -1593,6 +1121,7 @@ _ecp_nistz256_point_double: pushq %r15 subq $160+8,%rsp +L$point_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 @@ -1786,10 +1315,6 @@ _ecp_nistz256_point_double: .p2align 5 _ecp_nistz256_point_add: - 
movl $524544,%ecx - andl _OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $524544,%ecx - je L$point_addx pushq %rbp pushq %rbx pushq %r12 @@ -1817,7 +1342,7 @@ _ecp_nistz256_point_add: por %xmm1,%xmm3 movdqu 0(%rsi),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 @@ -1827,7 +1352,7 @@ _ecp_nistz256_point_add: movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1847,10 +1372,10 @@ _ecp_nistz256_point_add: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 @@ -1859,6 +1384,7 @@ _ecp_nistz256_point_add: movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi @@ -1950,7 +1476,7 @@ _ecp_nistz256_point_add: testq %r8,%r8 jnz L$add_proceedq testq %r9,%r9 - jz L$add_proceedq + jz L$add_doubleq .byte 102,72,15,126,199 pxor %xmm0,%xmm0 @@ -1962,6 +1488,13 @@ _ecp_nistz256_point_add: movdqu %xmm0,80(%rdi) jmp L$add_doneq +.p2align 5 +L$add_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp L$point_double_shortcutq + .p2align 5 L$add_proceedq: movq 0+64(%rsp),%rax @@ -2179,10 +1712,6 @@ L$add_doneq: .p2align 5 _ecp_nistz256_point_add_affine: - movl $524544,%ecx - andl _OPENSSL_ia32cap_P+8(%rip),%ecx - cmpl $524544,%ecx - je L$point_add_affinex pushq %rbp pushq %rbx pushq %r12 @@ -2213,13 +1742,13 @@ _ecp_nistz256_point_add_affine: por %xmm1,%xmm3 movdqu 0(%rbx),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -2235,13 +1764,13 @@ _ecp_nistz256_point_add_affine: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 @@ -2481,1023 +2010,3 @@ _ecp_nistz256_point_add_affine: popq %rbx popq %rbp .byte 0xf3,0xc3 - - -.p2align 5 -__ecp_nistz256_add_tox: - xorq %r11,%r11 - adcq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - -.p2align 5 -__ecp_nistz256_sub_fromx: - xorq %r11,%r11 - sbbq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - movq %r13,%rbp - sbbq $0,%r11 - - xorq %r10,%r10 - adcq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - -.p2align 5 -__ecp_nistz256_subx: - xorq %r11,%r11 - sbbq %r12,%rax - sbbq %r13,%rbp 
- movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq $0,%r11 - - xorq %r9,%r9 - adcq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - - btq $0,%r11 - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - cmovcq %rcx,%r8 - cmovcq %r10,%r9 - - .byte 0xf3,0xc3 - - - -.p2align 5 -__ecp_nistz256_mul_by_2x: - xorq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - -.p2align 5 -ecp_nistz256_point_doublex: -L$point_doublex: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $160+8,%rsp - - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq L$poly+8(%rip),%r14 - movq L$poly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-128(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 32(%rbx),%rdx - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-128(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montx - call __ecp_nistz256_mul_by_2x - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montx - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq %r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq %r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rdx - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 
16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 0+32(%rsp),%rdx - movq 8+32(%rsp),%r14 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 -.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montx - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - call __ecp_nistz256_subx - - movq 32(%rsp),%rdx - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-128(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromx - - addq $160+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - .byte 0xf3,0xc3 - - -.p2align 5 -ecp_nistz256_point_addx: -L$point_addx: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $576+8,%rsp - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - por %xmm0,%xmm1 - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - por %xmm2,%xmm3 - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm1,%xmm3 - - movdqu 0(%rsi),%xmm0 - pshufd $177,%xmm3,%xmm5 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $30,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-128(%rsi),%rsi - movq %rdx,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rdx - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 - - leaq 64-128(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 416(%rsp),%rdx - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 512(%rsp),%rdx - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq -128+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx 
- - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 480(%rsp),%rdx - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 0x3e - jnz L$add_proceedx -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - testq %r8,%r8 - jnz L$add_proceedx - testq %r9,%r9 - jz L$add_proceedx - -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp L$add_donex - -.p2align 5 -L$add_proceedx: - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq -128+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0(%rsp),%rdx - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 160(%rsp),%rdx - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - addq %r12,%r12 - leaq 96(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - sbbq %r11,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - testq %r11,%r11 - - cmovzq %rax,%r12 - movq 0(%rsi),%rax - cmovzq %rbp,%r13 - movq 8(%rsi),%rbp - cmovzq %rcx,%r8 - movq 16(%rsi),%rcx - cmovzq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call 
__ecp_nistz256_mul_montx - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -L$add_donex: - addq $576+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - .byte 0xf3,0xc3 - - -.p2align 5 -ecp_nistz256_point_add_affinex: -L$point_add_affinex: - pushq %rbp - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $480+8,%rsp - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - por %xmm0,%xmm1 - movdqa %xmm2,352(%rsp) - movdqa %xmm3,352+16(%rsp) - por %xmm2,%xmm3 - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm1,%xmm3 - - movdqu 0(%rbx),%xmm0 - pshufd $177,%xmm3,%xmm5 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $30,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-128(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 - movq 0(%rbx),%rdx - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 32-128(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 
- leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+96(%rsp),%rdx - movq 8+96(%rsp),%r14 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq -128+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - sbbq %r11,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - testq %r11,%r11 - - cmovzq %rax,%r12 - movq 0(%rsi),%rax - cmovzq %rbp,%r13 - movq 8(%rsi),%rbp - cmovzq %rcx,%r8 - movq 16(%rsi),%rcx - cmovzq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rdx - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq -128+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand L$ONE_mont(%rip),%xmm2 - pand L$ONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - 
movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - - addq $480+8,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - popq %rbp - .byte 0xf3,0xc3 diff --git a/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s b/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s index 86665d6e995628..e2bf1bb53a19be 100644 --- a/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/modes/aesni-gcm-x86_64.s @@ -1,753 +1,14 @@ .text - -.p2align 5 -_aesni_ctr32_ghash_6x: - vmovdqu 32(%r11),%xmm2 - subq $6,%rdx - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu 0-128(%rcx),%xmm15 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpaddb %xmm2,%xmm11,%xmm12 - vpaddb %xmm2,%xmm12,%xmm13 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm15,%xmm1,%xmm9 - vmovdqu %xmm4,16+8(%rsp) - jmp L$oop6x - -.p2align 5 -L$oop6x: - addl $100663296,%ebx - jc L$handle_ctr32 - vmovdqu 0-32(%r9),%xmm3 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm15,%xmm10,%xmm10 - vpxor %xmm15,%xmm11,%xmm11 - -L$resume_ctr32: - vmovdqu %xmm1,(%r8) - vpclmulqdq $16,%xmm3,%xmm7,%xmm5 - vpxor %xmm15,%xmm12,%xmm12 - vmovups 16-128(%rcx),%xmm2 - vpclmulqdq $1,%xmm3,%xmm7,%xmm6 - xorq %r12,%r12 - cmpq %r14,%r15 - - vaesenc %xmm2,%xmm9,%xmm9 - vmovdqu 48+8(%rsp),%xmm0 - vpxor %xmm15,%xmm13,%xmm13 - vpclmulqdq $0,%xmm3,%xmm7,%xmm1 - vaesenc %xmm2,%xmm10,%xmm10 - vpxor %xmm15,%xmm14,%xmm14 - setnc %r12b - vpclmulqdq $17,%xmm3,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vmovdqu 16-32(%r9),%xmm3 - negq %r12 - vaesenc %xmm2,%xmm12,%xmm12 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0,%xmm3,%xmm0,%xmm5 - vpxor %xmm4,%xmm8,%xmm8 - vaesenc %xmm2,%xmm13,%xmm13 - vpxor %xmm5,%xmm1,%xmm4 - andq $96,%r12 - vmovups 32-128(%rcx),%xmm15 - vpclmulqdq $16,%xmm3,%xmm0,%xmm1 - vaesenc %xmm2,%xmm14,%xmm14 - - vpclmulqdq $1,%xmm3,%xmm0,%xmm2 - leaq (%r14,%r12,1),%r14 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpclmulqdq $17,%xmm3,%xmm0,%xmm3 - vmovdqu 64+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 88(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 80(%r14),%r12 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,32+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,40+8(%rsp) - vmovdqu 48-32(%r9),%xmm5 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 48-128(%rcx),%xmm15 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0,%xmm5,%xmm0,%xmm1 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $16,%xmm5,%xmm0,%xmm2 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm3,%xmm7,%xmm7 - vpclmulqdq $1,%xmm5,%xmm0,%xmm3 - vaesenc %xmm15,%xmm11,%xmm11 - vpclmulqdq $17,%xmm5,%xmm0,%xmm5 - vmovdqu 80+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqu 64-32(%r9),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 64-128(%rcx),%xmm15 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0,%xmm1,%xmm0,%xmm2 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $16,%xmm1,%xmm0,%xmm3 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 72(%r14),%r13 - vpxor %xmm5,%xmm7,%xmm7 - vpclmulqdq $1,%xmm1,%xmm0,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 64(%r14),%r12 - vpclmulqdq $17,%xmm1,%xmm0,%xmm1 - vmovdqu 96+8(%rsp),%xmm0 - vaesenc 
%xmm15,%xmm12,%xmm12 - movq %r13,48+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,56+8(%rsp) - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 96-32(%r9),%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 80-128(%rcx),%xmm15 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0,%xmm2,%xmm0,%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $16,%xmm2,%xmm0,%xmm5 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 56(%r14),%r13 - vpxor %xmm1,%xmm7,%xmm7 - vpclmulqdq $1,%xmm2,%xmm0,%xmm1 - vpxor 112+8(%rsp),%xmm8,%xmm8 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 48(%r14),%r12 - vpclmulqdq $17,%xmm2,%xmm0,%xmm2 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,64+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,72+8(%rsp) - vpxor %xmm3,%xmm4,%xmm4 - vmovdqu 112-32(%r9),%xmm3 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 96-128(%rcx),%xmm15 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $16,%xmm3,%xmm8,%xmm5 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $1,%xmm3,%xmm8,%xmm1 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 40(%r14),%r13 - vpxor %xmm2,%xmm7,%xmm7 - vpclmulqdq $0,%xmm3,%xmm8,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 32(%r14),%r12 - vpclmulqdq $17,%xmm3,%xmm8,%xmm8 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,80+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,88+8(%rsp) - vpxor %xmm5,%xmm6,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor %xmm1,%xmm6,%xmm6 - - vmovups 112-128(%rcx),%xmm15 - vpslldq $8,%xmm6,%xmm5 - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 16(%r11),%xmm3 - - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm5,%xmm4,%xmm4 - movbeq 24(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 16(%r14),%r12 - vpalignr $8,%xmm4,%xmm4,%xmm0 - vpclmulqdq $16,%xmm3,%xmm4,%xmm4 - movq %r13,96+8(%rsp) - vaesenc %xmm15,%xmm12,%xmm12 - movq %r12,104+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - vmovups 128-128(%rcx),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 144-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm10,%xmm10 - vpsrldq $8,%xmm6,%xmm6 - vaesenc %xmm1,%xmm11,%xmm11 - vpxor %xmm6,%xmm7,%xmm7 - vaesenc %xmm1,%xmm12,%xmm12 - vpxor %xmm0,%xmm4,%xmm4 - movbeq 8(%r14),%r13 - vaesenc %xmm1,%xmm13,%xmm13 - movbeq 0(%r14),%r12 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 160-128(%rcx),%xmm1 - cmpl $11,%ebp - jb L$enc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 176-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 192-128(%rcx),%xmm1 - je L$enc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 208-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 224-128(%rcx),%xmm1 - jmp L$enc_tail - -.p2align 5 -L$handle_ctr32: - vmovdqu (%r11),%xmm0 - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vmovdqu 0-32(%r9),%xmm3 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm15,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 
- vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm15,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpshufb %xmm0,%xmm14,%xmm14 - vpshufb %xmm0,%xmm1,%xmm1 - jmp L$resume_ctr32 - -.p2align 5 -L$enc_tail: - vaesenc %xmm15,%xmm9,%xmm9 - vmovdqu %xmm7,16+8(%rsp) - vpalignr $8,%xmm4,%xmm4,%xmm8 - vaesenc %xmm15,%xmm10,%xmm10 - vpclmulqdq $16,%xmm3,%xmm4,%xmm4 - vpxor 0(%rdi),%xmm1,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 16(%rdi),%xmm1,%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 32(%rdi),%xmm1,%xmm5 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 48(%rdi),%xmm1,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 64(%rdi),%xmm1,%xmm7 - vpxor 80(%rdi),%xmm1,%xmm3 - vmovdqu (%r8),%xmm1 - - vaesenclast %xmm2,%xmm9,%xmm9 - vmovdqu 32(%r11),%xmm2 - vaesenclast %xmm0,%xmm10,%xmm10 - vpaddb %xmm2,%xmm1,%xmm0 - movq %r13,112+8(%rsp) - leaq 96(%rdi),%rdi - vaesenclast %xmm5,%xmm11,%xmm11 - vpaddb %xmm2,%xmm0,%xmm5 - movq %r12,120+8(%rsp) - leaq 96(%rsi),%rsi - vmovdqu 0-128(%rcx),%xmm15 - vaesenclast %xmm6,%xmm12,%xmm12 - vpaddb %xmm2,%xmm5,%xmm6 - vaesenclast %xmm7,%xmm13,%xmm13 - vpaddb %xmm2,%xmm6,%xmm7 - vaesenclast %xmm3,%xmm14,%xmm14 - vpaddb %xmm2,%xmm7,%xmm3 - - addq $96,%r10 - subq $6,%rdx - jc L$6x_done - - vmovups %xmm9,-96(%rsi) - vpxor %xmm15,%xmm1,%xmm9 - vmovups %xmm10,-80(%rsi) - vmovdqa %xmm0,%xmm10 - vmovups %xmm11,-64(%rsi) - vmovdqa %xmm5,%xmm11 - vmovups %xmm12,-48(%rsi) - vmovdqa %xmm6,%xmm12 - vmovups %xmm13,-32(%rsi) - vmovdqa %xmm7,%xmm13 - vmovups %xmm14,-16(%rsi) - vmovdqa %xmm3,%xmm14 - vmovdqu 32+8(%rsp),%xmm7 - jmp L$oop6x - -L$6x_done: - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpxor %xmm4,%xmm8,%xmm8 - - .byte 0xf3,0xc3 - -.globl _aesni_gcm_decrypt - -.p2align 5 -_aesni_gcm_decrypt: - xorq %r10,%r10 - cmpq $96,%rdx - jb L$gcm_dec_abort - - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq L$bswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $3968,%r15 - vmovdqu (%r9),%xmm8 - andq $-128,%rsp - vmovdqu (%r11),%xmm0 - leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 - movl 240-128(%rcx),%ebp - vpshufb %xmm0,%xmm8,%xmm8 - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc L$dec_no_key_aliasing - cmpq $768,%r15 - jnc L$dec_no_key_aliasing - subq %r15,%rsp -L$dec_no_key_aliasing: - - vmovdqu 80(%rdi),%xmm7 - leaq (%rdi),%r14 - vmovdqu 64(%rdi),%xmm4 - leaq -192(%rdi,%rdx,1),%r15 - vmovdqu 48(%rdi),%xmm5 - shrq $4,%rdx - xorq %r10,%r10 - vmovdqu 32(%rdi),%xmm6 - vpshufb %xmm0,%xmm7,%xmm7 - vmovdqu 16(%rdi),%xmm2 - vpshufb %xmm0,%xmm4,%xmm4 - vmovdqu (%rdi),%xmm3 - vpshufb %xmm0,%xmm5,%xmm5 - vmovdqu %xmm4,48(%rsp) - vpshufb %xmm0,%xmm6,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm2,%xmm2 - vmovdqu %xmm6,80(%rsp) - vpshufb %xmm0,%xmm3,%xmm3 - vmovdqu %xmm2,96(%rsp) - vmovdqu %xmm3,112(%rsp) - - call _aesni_ctr32_ghash_6x - - vmovups %xmm9,-96(%rsi) - vmovups %xmm10,-80(%rsi) - vmovups %xmm11,-64(%rsi) - vmovups %xmm12,-48(%rsi) - vmovups %xmm13,-32(%rsi) - vmovups %xmm14,-16(%rsi) - - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$gcm_dec_abort: - movq %r10,%rax - .byte 0xf3,0xc3 - - -.p2align 5 -_aesni_ctr32_6x: - vmovdqu 0-128(%rcx),%xmm4 - vmovdqu 32(%r11),%xmm2 - leaq -1(%rbp),%r13 - vmovups 16-128(%rcx),%xmm15 - leaq 32-128(%rcx),%r12 - vpxor %xmm4,%xmm1,%xmm9 - addl 
$100663296,%ebx - jc L$handle_ctr32_2 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddb %xmm2,%xmm11,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddb %xmm2,%xmm12,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp L$oop_ctr32 - -.p2align 4 -L$oop_ctr32: - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - vmovups (%r12),%xmm15 - leaq 16(%r12),%r12 - decl %r13d - jnz L$oop_ctr32 - - vmovdqu (%r12),%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 0(%rdi),%xmm3,%xmm4 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor 16(%rdi),%xmm3,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 32(%rdi),%xmm3,%xmm6 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 48(%rdi),%xmm3,%xmm8 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 64(%rdi),%xmm3,%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 80(%rdi),%xmm3,%xmm3 - leaq 96(%rdi),%rdi - - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm5,%xmm10,%xmm10 - vaesenclast %xmm6,%xmm11,%xmm11 - vaesenclast %xmm8,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - vaesenclast %xmm3,%xmm14,%xmm14 - vmovups %xmm9,0(%rsi) - vmovups %xmm10,16(%rsi) - vmovups %xmm11,32(%rsi) - vmovups %xmm12,48(%rsi) - vmovups %xmm13,64(%rsi) - vmovups %xmm14,80(%rsi) - leaq 96(%rsi),%rsi - - .byte 0xf3,0xc3 -.p2align 5 -L$handle_ctr32_2: - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpshufb %xmm0,%xmm14,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpshufb %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp L$oop_ctr32 - - .globl _aesni_gcm_encrypt -.p2align 5 _aesni_gcm_encrypt: - xorq %r10,%r10 - cmpq $288,%rdx - jb L$gcm_enc_abort - - leaq (%rsp),%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq L$bswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $3968,%r15 - leaq 128(%rcx),%rcx - vmovdqu (%r11),%xmm0 - andq $-128,%rsp - movl 240-128(%rcx),%ebp - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc L$enc_no_key_aliasing - cmpq $768,%r15 - jnc L$enc_no_key_aliasing - subq %r15,%rsp -L$enc_no_key_aliasing: - - leaq (%rsi),%r14 - leaq -192(%rsi,%rdx,1),%r15 - shrq $4,%rdx - - call _aesni_ctr32_6x - vpshufb %xmm0,%xmm9,%xmm8 - vpshufb %xmm0,%xmm10,%xmm2 - vmovdqu %xmm8,112(%rsp) - vpshufb %xmm0,%xmm11,%xmm4 - vmovdqu %xmm2,96(%rsp) - vpshufb %xmm0,%xmm12,%xmm5 - vmovdqu %xmm4,80(%rsp) - vpshufb %xmm0,%xmm13,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm14,%xmm7 - vmovdqu %xmm6,48(%rsp) - - call _aesni_ctr32_6x - - vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 - subq $12,%rdx - movq $192,%r10 - vpshufb %xmm0,%xmm8,%xmm8 - - call _aesni_ctr32_ghash_6x - vmovdqu 32(%rsp),%xmm7 - vmovdqu (%r11),%xmm0 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm7,%xmm7,%xmm1 - vmovdqu 32-32(%r9),%xmm15 - vmovups %xmm9,-96(%rsi) - vpshufb %xmm0,%xmm9,%xmm9 - vpxor %xmm7,%xmm1,%xmm1 - vmovups %xmm10,-80(%rsi) - vpshufb %xmm0,%xmm10,%xmm10 - vmovups %xmm11,-64(%rsi) - vpshufb %xmm0,%xmm11,%xmm11 - 
vmovups %xmm12,-48(%rsi) - vpshufb %xmm0,%xmm12,%xmm12 - vmovups %xmm13,-32(%rsi) - vpshufb %xmm0,%xmm13,%xmm13 - vmovups %xmm14,-16(%rsi) - vpshufb %xmm0,%xmm14,%xmm14 - vmovdqu %xmm9,16(%rsp) - vmovdqu 48(%rsp),%xmm6 - vmovdqu 16-32(%r9),%xmm0 - vpunpckhqdq %xmm6,%xmm6,%xmm2 - vpclmulqdq $0,%xmm3,%xmm7,%xmm5 - vpxor %xmm6,%xmm2,%xmm2 - vpclmulqdq $17,%xmm3,%xmm7,%xmm7 - vpclmulqdq $0,%xmm15,%xmm1,%xmm1 - - vmovdqu 64(%rsp),%xmm9 - vpclmulqdq $0,%xmm0,%xmm6,%xmm4 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm9,%xmm9,%xmm5 - vpclmulqdq $17,%xmm0,%xmm6,%xmm6 - vpxor %xmm9,%xmm5,%xmm5 - vpxor %xmm7,%xmm6,%xmm6 - vpclmulqdq $16,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vmovdqu 80(%rsp),%xmm1 - vpclmulqdq $0,%xmm3,%xmm9,%xmm7 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm4,%xmm7,%xmm7 - vpunpckhqdq %xmm1,%xmm1,%xmm4 - vpclmulqdq $17,%xmm3,%xmm9,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm6,%xmm9,%xmm9 - vpclmulqdq $0,%xmm15,%xmm5,%xmm5 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 96(%rsp),%xmm2 - vpclmulqdq $0,%xmm0,%xmm1,%xmm6 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm7,%xmm6,%xmm6 - vpunpckhqdq %xmm2,%xmm2,%xmm7 - vpclmulqdq $17,%xmm0,%xmm1,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpxor %xmm9,%xmm1,%xmm1 - vpclmulqdq $16,%xmm15,%xmm4,%xmm4 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm5,%xmm4,%xmm4 - - vpxor 112(%rsp),%xmm8,%xmm8 - vpclmulqdq $0,%xmm3,%xmm2,%xmm5 - vmovdqu 112-32(%r9),%xmm0 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $17,%xmm3,%xmm2,%xmm2 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm1,%xmm2,%xmm2 - vpclmulqdq $0,%xmm15,%xmm7,%xmm7 - vpxor %xmm4,%xmm7,%xmm4 - - vpclmulqdq $0,%xmm0,%xmm8,%xmm6 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm1 - vpclmulqdq $17,%xmm0,%xmm8,%xmm8 - vpxor %xmm14,%xmm1,%xmm1 - vpxor %xmm5,%xmm6,%xmm5 - vpclmulqdq $16,%xmm15,%xmm9,%xmm9 - vmovdqu 32-32(%r9),%xmm15 - vpxor %xmm2,%xmm8,%xmm7 - vpxor %xmm4,%xmm9,%xmm6 - - vmovdqu 16-32(%r9),%xmm0 - vpxor %xmm5,%xmm7,%xmm9 - vpclmulqdq $0,%xmm3,%xmm14,%xmm4 - vpxor %xmm9,%xmm6,%xmm6 - vpunpckhqdq %xmm13,%xmm13,%xmm2 - vpclmulqdq $17,%xmm3,%xmm14,%xmm14 - vpxor %xmm13,%xmm2,%xmm2 - vpslldq $8,%xmm6,%xmm9 - vpclmulqdq $0,%xmm15,%xmm1,%xmm1 - vpxor %xmm9,%xmm5,%xmm8 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm6,%xmm7,%xmm7 - - vpclmulqdq $0,%xmm0,%xmm13,%xmm5 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm12,%xmm12,%xmm9 - vpclmulqdq $17,%xmm0,%xmm13,%xmm13 - vpxor %xmm12,%xmm9,%xmm9 - vpxor %xmm14,%xmm13,%xmm13 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpclmulqdq $16,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0,%xmm3,%xmm12,%xmm4 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm11,%xmm11,%xmm1 - vpclmulqdq $17,%xmm3,%xmm12,%xmm12 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm13,%xmm12,%xmm12 - vxorps 16(%rsp),%xmm7,%xmm7 - vpclmulqdq $0,%xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm9,%xmm9 - - vpclmulqdq $16,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0,%xmm0,%xmm11,%xmm5 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm10,%xmm10,%xmm2 - vpclmulqdq $17,%xmm0,%xmm11,%xmm11 - vpxor %xmm10,%xmm2,%xmm2 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpxor %xmm12,%xmm11,%xmm11 - vpclmulqdq $16,%xmm15,%xmm1,%xmm1 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - - vxorps %xmm7,%xmm14,%xmm14 - vpclmulqdq $16,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0,%xmm3,%xmm10,%xmm4 - vmovdqu 112-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - 
vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpclmulqdq $17,%xmm3,%xmm10,%xmm10 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm11,%xmm10,%xmm10 - vpclmulqdq $0,%xmm15,%xmm2,%xmm2 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0,%xmm0,%xmm8,%xmm5 - vpclmulqdq $17,%xmm0,%xmm8,%xmm7 - vpxor %xmm4,%xmm5,%xmm5 - vpclmulqdq $16,%xmm15,%xmm9,%xmm6 - vpxor %xmm10,%xmm7,%xmm7 - vpxor %xmm2,%xmm6,%xmm6 - - vpxor %xmm5,%xmm7,%xmm4 - vpxor %xmm4,%xmm6,%xmm6 - vpslldq $8,%xmm6,%xmm1 - vmovdqu 16(%r11),%xmm3 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm1,%xmm5,%xmm8 - vpxor %xmm6,%xmm7,%xmm7 + xorl %eax,%eax + .byte 0xf3,0xc3 - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $16,%xmm3,%xmm8,%xmm8 - vpxor %xmm2,%xmm8,%xmm8 - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $16,%xmm3,%xmm8,%xmm8 - vpxor %xmm7,%xmm2,%xmm2 - vpxor %xmm2,%xmm8,%xmm8 - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) +.globl _aesni_gcm_decrypt - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$gcm_enc_abort: - movq %r10,%rax +_aesni_gcm_decrypt: + xorl %eax,%eax .byte 0xf3,0xc3 - -.p2align 6 -L$bswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -L$poly: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -L$one_msb: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -L$two_lsb: -.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -L$one_lsb: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 6 diff --git a/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s b/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s index 09ac73bc97a349..77fddf934af3dc 100644 --- a/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/modes/ghash-x86_64.s @@ -20,14 +20,14 @@ L$gmult_prologue: movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl movq %r8,%rdx jmp L$oop1 .p2align 4 L$oop1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 @@ -43,13 +43,13 @@ L$oop1: js L$break1 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 @@ -58,19 +58,19 @@ L$oop1: .p2align 4 L$break1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 @@ -661,10 +661,10 @@ L$ghash_epilogue: _gcm_init_clmul: L$_init_clmul: movdqu (%rsi),%xmm2 - pshufd $78,%xmm2,%xmm2 + pshufd $0b01001110,%xmm2,%xmm2 - pshufd $255,%xmm2,%xmm4 + pshufd $0b11111111,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 @@ -678,11 +678,11 @@ L$_init_clmul: pxor %xmm5,%xmm2 - pshufd $78,%xmm2,%xmm6 + pshufd $0b01001110,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -718,8 +718,8 @@ L$_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm2,%xmm3 + pshufd 
$0b01001110,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 @@ -727,7 +727,7 @@ L$_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -765,7 +765,7 @@ L$_init_clmul: pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -801,8 +801,8 @@ L$_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm5,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm5,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 @@ -822,7 +822,7 @@ L$_gmult_clmul: movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -874,20 +874,20 @@ L$_ghash_clmul: movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 - subq $16,%rcx + subq $0x10,%rcx jz L$odd_tail movdqu 16(%rsi),%xmm6 movl _OPENSSL_ia32cap_P+4(%rip),%eax - cmpq $48,%rcx + cmpq $0x30,%rcx jb L$skip4x andl $71303168,%eax cmpl $4194304,%eax je L$skip4x - subq $48,%rcx - movq $11547335547999543296,%rax + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 @@ -899,14 +899,14 @@ L$_ghash_clmul: .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 @@ -921,12 +921,12 @@ L$_ghash_clmul: .byte 102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 @@ -934,7 +934,7 @@ L$_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jc L$tail4x jmp L$mod4_loop @@ -949,14 +949,14 @@ L$mod4_loop: movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 @@ -1000,7 +1000,7 @@ L$mod4_loop: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 @@ -1010,14 +1010,14 @@ L$mod4_loop: movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jnc L$mod4_loop L$tail4x: @@ -1061,10 +1061,10 @@ L$tail4x: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - addq $64,%rcx + addq $0x40,%rcx jz L$done movdqu 32(%rsi),%xmm7 - subq $16,%rcx + subq $0x10,%rcx jz L$odd_tail L$skip4x: @@ -1079,7 +1079,7 @@ L$skip4x: pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 - pshufd 
$78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 @@ -1087,7 +1087,7 @@ L$skip4x: leaq 32(%rdx),%rdx nop - subq $32,%rcx + subq $0x20,%rcx jbe L$even_tail nop jmp L$mod_loop @@ -1096,7 +1096,7 @@ L$skip4x: L$mod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1134,7 +1134,7 @@ L$mod_loop: pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 - pshufd $78,%xmm5,%xmm4 + pshufd $0b01001110,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 @@ -1150,13 +1150,13 @@ L$mod_loop: .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 - subq $32,%rcx + subq $0x20,%rcx ja L$mod_loop L$even_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1204,7 +1204,7 @@ L$odd_tail: .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -1249,108 +1249,7 @@ L$done: .p2align 5 _gcm_init_avx: - vzeroupper - - vmovdqu (%rsi),%xmm2 - vpshufd $78,%xmm2,%xmm2 - - - vpshufd $255,%xmm2,%xmm4 - vpsrlq $63,%xmm2,%xmm3 - vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5 - vpcmpgtd %xmm4,%xmm5,%xmm5 - vpslldq $8,%xmm3,%xmm3 - vpor %xmm3,%xmm2,%xmm2 - - - vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5 - vpxor %xmm5,%xmm2,%xmm2 - - vpunpckhqdq %xmm2,%xmm2,%xmm6 - vmovdqa %xmm2,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - movq $4,%r10 - jmp L$init_start_avx -.p2align 5 -L$init_loop_avx: - vpalignr $8,%xmm3,%xmm4,%xmm5 - vmovdqu %xmm5,-16(%rdi) - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 -L$init_start_avx: - vmovdqa %xmm0,%xmm5 - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - vpshufd $78,%xmm5,%xmm3 - vpshufd $78,%xmm0,%xmm4 - vpxor %xmm5,%xmm3,%xmm3 - vmovdqu %xmm5,0(%rdi) - vpxor %xmm0,%xmm4,%xmm4 - vmovdqu %xmm0,16(%rdi) - leaq 48(%rdi),%rdi - subq $1,%r10 - jnz L$init_loop_avx - - vpalignr $8,%xmm4,%xmm3,%xmm5 - vmovdqu %xmm5,-16(%rdi) - - vzeroupper - .byte 0xf3,0xc3 + jmp L$_init_clmul .globl _gcm_gmult_avx @@ -1362,377 +1261,7 @@ 
_gcm_gmult_avx: .p2align 5 _gcm_ghash_avx: - vzeroupper - - vmovdqu (%rdi),%xmm10 - leaq L$0x1c2_polynomial(%rip),%r10 - leaq 64(%rsi),%rsi - vmovdqu L$bswap_mask(%rip),%xmm13 - vpshufb %xmm13,%xmm10,%xmm10 - cmpq $128,%rcx - jb L$short_avx - subq $128,%rcx - - vmovdqu 112(%rdx),%xmm14 - vmovdqu 0-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vmovdqu 32-64(%rsi),%xmm7 - - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm14,%xmm9,%xmm9 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 80(%rdx),%xmm14 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 48-64(%rsi),%xmm6 - vpxor %xmm14,%xmm9,%xmm9 - vmovdqu 64(%rdx),%xmm15 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 48(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 32(%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 16(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu (%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $16,%xmm7,%xmm9,%xmm2 - - leaq 128(%rdx),%rdx - cmpq $128,%rcx - jb L$tail_avx - - vpxor %xmm10,%xmm15,%xmm15 - subq $128,%rcx - jmp L$oop8x_avx - -.p2align 5 -L$oop8x_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 112(%rdx),%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm15,%xmm8,%xmm8 - vpclmulqdq $0,%xmm6,%xmm15,%xmm10 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm11 - vmovdqu 0-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm12 - vmovdqu 32-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpxor %xmm3,%xmm10,%xmm10 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vxorps %xmm4,%xmm11,%xmm11 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm5,%xmm12,%xmm12 - vxorps %xmm15,%xmm8,%xmm8 - - vmovdqu 
80(%rdx),%xmm14 - vpxor %xmm10,%xmm12,%xmm12 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpxor %xmm11,%xmm12,%xmm12 - vpslldq $8,%xmm12,%xmm9 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vpsrldq $8,%xmm12,%xmm12 - vpxor %xmm9,%xmm10,%xmm10 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vxorps %xmm12,%xmm11,%xmm11 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 64(%rdx),%xmm15 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vxorps %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - - vmovdqu 48(%rdx),%xmm14 - vpclmulqdq $16,(%r10),%xmm10,%xmm10 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 32(%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - vxorps %xmm12,%xmm10,%xmm10 - - vmovdqu 16(%rdx),%xmm14 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $17,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpclmulqdq $16,(%r10),%xmm10,%xmm10 - vxorps %xmm11,%xmm12,%xmm12 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $16,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu (%rdx),%xmm15 - vpclmulqdq $0,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $17,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm12,%xmm15,%xmm15 - vpclmulqdq $16,%xmm7,%xmm9,%xmm2 - vpxor %xmm10,%xmm15,%xmm15 - - leaq 128(%rdx),%rdx - subq $128,%rcx - jnc L$oop8x_avx - - addq $128,%rcx - jmp L$tail_no_xor_avx - -.p2align 5 -L$short_avx: - vmovdqu -16(%rdx,%rcx,1),%xmm14 - leaq (%rdx,%rcx,1),%rdx - vmovdqu 0-64(%rsi),%xmm6 - vmovdqu 32-64(%rsi),%xmm7 - vpshufb %xmm13,%xmm14,%xmm15 - - vmovdqa %xmm0,%xmm3 - vmovdqa %xmm1,%xmm4 - vmovdqa %xmm2,%xmm5 - subq $16,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -32(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $16,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -48(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vmovdqu 80-64(%rsi),%xmm7 - subq $16,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - 
vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -64(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $16,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -80(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 96-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vmovdqu 128-64(%rsi),%xmm7 - subq $16,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -96(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $16,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -112(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vmovdqu 144-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - vmovq 184-64(%rsi),%xmm7 - subq $16,%rcx - jmp L$tail_avx - -.p2align 5 -L$tail_avx: - vpxor %xmm10,%xmm15,%xmm15 -L$tail_no_xor_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $17,%xmm6,%xmm15,%xmm1 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0,%xmm7,%xmm8,%xmm2 - - vmovdqu (%r10),%xmm12 - - vpxor %xmm0,%xmm3,%xmm10 - vpxor %xmm1,%xmm4,%xmm11 - vpxor %xmm2,%xmm5,%xmm5 - - vpxor %xmm10,%xmm5,%xmm5 - vpxor %xmm11,%xmm5,%xmm5 - vpslldq $8,%xmm5,%xmm9 - vpsrldq $8,%xmm5,%xmm5 - vpxor %xmm9,%xmm10,%xmm10 - vpxor %xmm5,%xmm11,%xmm11 - - vpclmulqdq $16,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - vpclmulqdq $16,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm11,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - cmpq $0,%rcx - jne L$short_avx - - vpshufb %xmm13,%xmm10,%xmm10 - vmovdqu %xmm10,(%rdi) - vzeroupper - .byte 0xf3,0xc3 + jmp L$_ghash_clmul .p2align 6 L$bswap_mask: diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s index c164fc3c42bb38..a0de51655d1d3a 100644 --- a/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/sha/sha1-mb-x86_64.s @@ -9,8 +9,6 @@ _sha1_multi_block: movq _OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut - testl $268435456,%ecx - jnz _avx_shortcut movq %rsp,%rax pushq %rbx pushq %rbp @@ -2601,10 +2599,10 @@ L$oop_grande_shaext: punpcklqdq %xmm5,%xmm0 punpckhqdq %xmm5,%xmm8 - pshufd $63,%xmm7,%xmm1 - pshufd $127,%xmm7,%xmm9 - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00111111,%xmm7,%xmm1 + pshufd $0b01111111,%xmm7,%xmm9 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 jmp L$oop_shaext .p2align 5 @@ -2859,8 +2857,8 @@ L$oop_shaext: .byte 69,15,58,204,193,3 .byte 69,15,56,200,214 - pshufd $0,%xmm6,%xmm11 - pshufd $85,%xmm6,%xmm12 + pshufd $0x00,%xmm6,%xmm11 + pshufd $0x55,%xmm6,%xmm12 movdqa %xmm6,%xmm7 
pcmpgtd %xmm4,%xmm11 pcmpgtd %xmm4,%xmm12 @@ -2890,8 +2888,8 @@ L$oop_shaext: movl 280(%rsp),%edx - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 movdqa %xmm0,%xmm6 punpckldq %xmm8,%xmm0 @@ -2920,4291 +2918,6 @@ L$epilogue_shaext: .byte 0xf3,0xc3 -.p2align 5 -sha1_multi_block_avx: -_avx_shortcut: - shrq $32,%rcx - cmpl $2,%edx - jb L$avx - testl $32,%ecx - jnz _avx2_shortcut - jmp L$avx -.p2align 5 -L$avx: - movq %rsp,%rax - pushq %rbx - pushq %rbp - subq $288,%rsp - andq $-256,%rsp - movq %rax,272(%rsp) -L$body_avx: - leaq K_XX_XX(%rip),%rbp - leaq 256(%rsp),%rbx - - vzeroupper -L$oop_grande_avx: - movl %edx,280(%rsp) - xorl %edx,%edx - movq 0(%rsi),%r8 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r8 - movq 16(%rsi),%r9 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r9 - movq 32(%rsi),%r10 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r10 - movq 48(%rsi),%r11 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r11 - testl %edx,%edx - jz L$done_avx - - vmovdqu 0(%rdi),%xmm10 - leaq 128(%rsp),%rax - vmovdqu 32(%rdi),%xmm11 - vmovdqu 64(%rdi),%xmm12 - vmovdqu 96(%rdi),%xmm13 - vmovdqu 128(%rdi),%xmm14 - vmovdqu 96(%rbp),%xmm5 - jmp L$oop_avx - -.p2align 5 -L$oop_avx: - vmovdqa -32(%rbp),%xmm15 - vmovd (%r8),%xmm0 - leaq 64(%r8),%r8 - vmovd (%r9),%xmm2 - leaq 64(%r9),%r9 - vpinsrd $1,(%r10),%xmm0,%xmm0 - leaq 64(%r10),%r10 - vpinsrd $1,(%r11),%xmm2,%xmm2 - leaq 64(%r11),%r11 - vmovd -60(%r8),%xmm1 - vpunpckldq %xmm2,%xmm0,%xmm0 - vmovd -60(%r9),%xmm9 - vpshufb %xmm5,%xmm0,%xmm0 - vpinsrd $1,-60(%r10),%xmm1,%xmm1 - vpinsrd $1,-60(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,0-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -56(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -56(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-56(%r10),%xmm2,%xmm2 - vpinsrd $1,-56(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,16-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -52(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -52(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-52(%r10),%xmm3,%xmm3 - vpinsrd $1,-52(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,32-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -48(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -48(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-48(%r10),%xmm4,%xmm4 - 
vpinsrd $1,-48(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,48-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -44(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -44(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-44(%r10),%xmm0,%xmm0 - vpinsrd $1,-44(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,64-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -40(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -40(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-40(%r10),%xmm1,%xmm1 - vpinsrd $1,-40(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,80-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -36(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -36(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-36(%r10),%xmm2,%xmm2 - vpinsrd $1,-36(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,96-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -32(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -32(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-32(%r10),%xmm3,%xmm3 - vpinsrd $1,-32(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,112-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -28(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -28(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-28(%r10),%xmm4,%xmm4 - vpinsrd $1,-28(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,128-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -24(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -24(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpinsrd $1,-24(%r10),%xmm0,%xmm0 - vpinsrd $1,-24(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa 
%xmm4,144-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -20(%r8),%xmm1 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -20(%r9),%xmm9 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpinsrd $1,-20(%r10),%xmm1,%xmm1 - vpinsrd $1,-20(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,160-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpunpckldq %xmm9,%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -16(%r8),%xmm2 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -16(%r9),%xmm9 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpshufb %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpinsrd $1,-16(%r10),%xmm2,%xmm2 - vpinsrd $1,-16(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,176-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpunpckldq %xmm9,%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -12(%r8),%xmm3 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -12(%r9),%xmm9 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpshufb %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpinsrd $1,-12(%r10),%xmm3,%xmm3 - vpinsrd $1,-12(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,192-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpunpckldq %xmm9,%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -8(%r8),%xmm4 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -8(%r9),%xmm9 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpshufb %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpinsrd $1,-8(%r10),%xmm4,%xmm4 - vpinsrd $1,-8(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,208-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpunpckldq %xmm9,%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vmovd -4(%r8),%xmm0 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vmovd -4(%r9),%xmm9 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpshufb %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vmovdqa 0-128(%rax),%xmm1 - vpinsrd $1,-4(%r10),%xmm0,%xmm0 - vpinsrd $1,-4(%r11),%xmm9,%xmm9 - vpaddd %xmm15,%xmm10,%xmm10 - prefetcht0 63(%r8) - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,224-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpunpckldq %xmm9,%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - prefetcht0 63(%r9) - vpxor %xmm7,%xmm6,%xmm6 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - prefetcht0 63(%r10) - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - prefetcht0 63(%r11) - vpshufb %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 16-128(%rax),%xmm2 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 32-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpandn %xmm13,%xmm11,%xmm7 - - vpand %xmm12,%xmm11,%xmm6 - - vmovdqa %xmm0,240-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 
128-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 48-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpandn %xmm12,%xmm10,%xmm7 - - vpand %xmm11,%xmm10,%xmm6 - - vmovdqa %xmm1,0-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 144-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 64-128(%rax),%xmm0 - - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpandn %xmm11,%xmm14,%xmm7 - - vpand %xmm10,%xmm14,%xmm6 - - vmovdqa %xmm2,16-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 160-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 80-128(%rax),%xmm1 - - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpandn %xmm10,%xmm13,%xmm7 - - vpand %xmm14,%xmm13,%xmm6 - - vmovdqa %xmm3,32-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 176-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 96-128(%rax),%xmm2 - - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpandn %xmm14,%xmm12,%xmm7 - - vpand %xmm13,%xmm12,%xmm6 - - vmovdqa %xmm4,48-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 192-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm7,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 0(%rbp),%xmm15 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 112-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,64-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 208-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 128-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,80-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 224-128(%rax),%xmm2,%xmm2 - vpsrld 
$27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 144-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,96-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 240-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 160-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,112-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 0-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 176-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,128-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 16-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 192-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,144-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 32-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 208-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,160-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 48-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 224-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,176-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 64-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld 
$2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 240-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,192-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 80-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 0-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,208-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 96-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 16-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,224-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 112-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 32-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,240-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 128-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 48-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,0-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 144-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 64-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,16-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 160-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 80-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor 
%xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,32-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 176-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 96-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,48-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 192-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 112-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,64-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 208-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 128-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,80-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 224-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 144-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,96-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 240-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 160-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,112-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 0-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 32(%rbp),%xmm15 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 176-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpand %xmm12,%xmm13,%xmm7 - vpxor 16-128(%rax),%xmm1,%xmm1 - - vpaddd %xmm7,%xmm14,%xmm14 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm13,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vmovdqu %xmm0,128-128(%rax) - 
vpaddd %xmm0,%xmm14,%xmm14 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm1,%xmm5 - vpand %xmm11,%xmm6,%xmm6 - vpaddd %xmm1,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 192-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpand %xmm11,%xmm12,%xmm7 - vpxor 32-128(%rax),%xmm2,%xmm2 - - vpaddd %xmm7,%xmm13,%xmm13 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm12,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vmovdqu %xmm1,144-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm2,%xmm5 - vpand %xmm10,%xmm6,%xmm6 - vpaddd %xmm2,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 208-128(%rax),%xmm0 - - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpand %xmm10,%xmm11,%xmm7 - vpxor 48-128(%rax),%xmm3,%xmm3 - - vpaddd %xmm7,%xmm12,%xmm12 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm11,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vmovdqu %xmm2,160-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm3,%xmm5 - vpand %xmm14,%xmm6,%xmm6 - vpaddd %xmm3,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 224-128(%rax),%xmm1 - - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpand %xmm14,%xmm10,%xmm7 - vpxor 64-128(%rax),%xmm4,%xmm4 - - vpaddd %xmm7,%xmm11,%xmm11 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm10,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vmovdqu %xmm3,176-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm4,%xmm5 - vpand %xmm13,%xmm6,%xmm6 - vpaddd %xmm4,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 240-128(%rax),%xmm2 - - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpand %xmm13,%xmm14,%xmm7 - vpxor 80-128(%rax),%xmm0,%xmm0 - - vpaddd %xmm7,%xmm10,%xmm10 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm14,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vmovdqu %xmm4,192-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm0,%xmm5 - vpand %xmm12,%xmm6,%xmm6 - vpaddd %xmm0,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 0-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpand %xmm12,%xmm13,%xmm7 - vpxor 96-128(%rax),%xmm1,%xmm1 - - vpaddd %xmm7,%xmm14,%xmm14 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm13,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vmovdqu %xmm0,208-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm1,%xmm5 - vpand %xmm11,%xmm6,%xmm6 - vpaddd %xmm1,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 16-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpand %xmm11,%xmm12,%xmm7 - vpxor 112-128(%rax),%xmm2,%xmm2 - - 
vpaddd %xmm7,%xmm13,%xmm13 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm12,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vmovdqu %xmm1,224-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm2,%xmm5 - vpand %xmm10,%xmm6,%xmm6 - vpaddd %xmm2,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 32-128(%rax),%xmm0 - - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpand %xmm10,%xmm11,%xmm7 - vpxor 128-128(%rax),%xmm3,%xmm3 - - vpaddd %xmm7,%xmm12,%xmm12 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm11,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vmovdqu %xmm2,240-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm3,%xmm5 - vpand %xmm14,%xmm6,%xmm6 - vpaddd %xmm3,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 48-128(%rax),%xmm1 - - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpand %xmm14,%xmm10,%xmm7 - vpxor 144-128(%rax),%xmm4,%xmm4 - - vpaddd %xmm7,%xmm11,%xmm11 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm10,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vmovdqu %xmm3,0-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm4,%xmm5 - vpand %xmm13,%xmm6,%xmm6 - vpaddd %xmm4,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 64-128(%rax),%xmm2 - - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpand %xmm13,%xmm14,%xmm7 - vpxor 160-128(%rax),%xmm0,%xmm0 - - vpaddd %xmm7,%xmm10,%xmm10 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm14,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vmovdqu %xmm4,16-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm0,%xmm5 - vpand %xmm12,%xmm6,%xmm6 - vpaddd %xmm0,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 80-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpand %xmm12,%xmm13,%xmm7 - vpxor 176-128(%rax),%xmm1,%xmm1 - - vpaddd %xmm7,%xmm14,%xmm14 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm13,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vmovdqu %xmm0,32-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm1,%xmm5 - vpand %xmm11,%xmm6,%xmm6 - vpaddd %xmm1,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 96-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpand %xmm11,%xmm12,%xmm7 - vpxor 192-128(%rax),%xmm2,%xmm2 - - vpaddd %xmm7,%xmm13,%xmm13 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm12,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vmovdqu %xmm1,48-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm2,%xmm5 - vpand %xmm10,%xmm6,%xmm6 - vpaddd %xmm2,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 
112-128(%rax),%xmm0 - - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpand %xmm10,%xmm11,%xmm7 - vpxor 208-128(%rax),%xmm3,%xmm3 - - vpaddd %xmm7,%xmm12,%xmm12 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm11,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vmovdqu %xmm2,64-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm3,%xmm5 - vpand %xmm14,%xmm6,%xmm6 - vpaddd %xmm3,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpaddd %xmm6,%xmm12,%xmm12 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 128-128(%rax),%xmm1 - - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpand %xmm14,%xmm10,%xmm7 - vpxor 224-128(%rax),%xmm4,%xmm4 - - vpaddd %xmm7,%xmm11,%xmm11 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm10,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vmovdqu %xmm3,80-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm4,%xmm5 - vpand %xmm13,%xmm6,%xmm6 - vpaddd %xmm4,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 144-128(%rax),%xmm2 - - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpand %xmm13,%xmm14,%xmm7 - vpxor 240-128(%rax),%xmm0,%xmm0 - - vpaddd %xmm7,%xmm10,%xmm10 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm14,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vmovdqu %xmm4,96-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm0,%xmm5 - vpand %xmm12,%xmm6,%xmm6 - vpaddd %xmm0,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 160-128(%rax),%xmm3 - - vpaddd %xmm15,%xmm14,%xmm14 - vpslld $5,%xmm10,%xmm8 - vpand %xmm12,%xmm13,%xmm7 - vpxor 0-128(%rax),%xmm1,%xmm1 - - vpaddd %xmm7,%xmm14,%xmm14 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm13,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vmovdqu %xmm0,112-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm1,%xmm5 - vpand %xmm11,%xmm6,%xmm6 - vpaddd %xmm1,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpaddd %xmm6,%xmm14,%xmm14 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 176-128(%rax),%xmm4 - - vpaddd %xmm15,%xmm13,%xmm13 - vpslld $5,%xmm14,%xmm8 - vpand %xmm11,%xmm12,%xmm7 - vpxor 16-128(%rax),%xmm2,%xmm2 - - vpaddd %xmm7,%xmm13,%xmm13 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm12,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vmovdqu %xmm1,128-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm2,%xmm5 - vpand %xmm10,%xmm6,%xmm6 - vpaddd %xmm2,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpaddd %xmm6,%xmm13,%xmm13 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 192-128(%rax),%xmm0 - - vpaddd %xmm15,%xmm12,%xmm12 - vpslld $5,%xmm13,%xmm8 - vpand %xmm10,%xmm11,%xmm7 - vpxor 32-128(%rax),%xmm3,%xmm3 - - vpaddd %xmm7,%xmm12,%xmm12 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm11,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vmovdqu %xmm2,144-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm3,%xmm5 - vpand %xmm14,%xmm6,%xmm6 - vpaddd %xmm3,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpaddd %xmm6,%xmm12,%xmm12 - - 
vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 208-128(%rax),%xmm1 - - vpaddd %xmm15,%xmm11,%xmm11 - vpslld $5,%xmm12,%xmm8 - vpand %xmm14,%xmm10,%xmm7 - vpxor 48-128(%rax),%xmm4,%xmm4 - - vpaddd %xmm7,%xmm11,%xmm11 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm10,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vmovdqu %xmm3,160-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm4,%xmm5 - vpand %xmm13,%xmm6,%xmm6 - vpaddd %xmm4,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpaddd %xmm6,%xmm11,%xmm11 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 224-128(%rax),%xmm2 - - vpaddd %xmm15,%xmm10,%xmm10 - vpslld $5,%xmm11,%xmm8 - vpand %xmm13,%xmm14,%xmm7 - vpxor 64-128(%rax),%xmm0,%xmm0 - - vpaddd %xmm7,%xmm10,%xmm10 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm14,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vmovdqu %xmm4,176-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpor %xmm9,%xmm8,%xmm8 - vpsrld $31,%xmm0,%xmm5 - vpand %xmm12,%xmm6,%xmm6 - vpaddd %xmm0,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vmovdqa 64(%rbp),%xmm15 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 240-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,192-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 80-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 0-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,208-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 96-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 16-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,224-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 112-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 32-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,240-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 128-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 
- vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 48-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,0-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 144-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 64-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,16-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 160-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 80-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,32-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 176-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 96-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vmovdqa %xmm2,48-128(%rax) - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 192-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 112-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vmovdqa %xmm3,64-128(%rax) - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 208-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 128-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vmovdqa %xmm4,80-128(%rax) - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 224-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 144-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vmovdqa %xmm0,96-128(%rax) - vpaddd %xmm0,%xmm14,%xmm14 - 
vpxor 240-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 160-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vmovdqa %xmm1,112-128(%rax) - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 0-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor %xmm0,%xmm3,%xmm3 - vmovdqa 176-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 16-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 192-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 32-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa 208-128(%rax),%xmm2 - - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - vpaddd %xmm4,%xmm10,%xmm10 - vpxor 48-128(%rax),%xmm0,%xmm0 - vpsrld $27,%xmm11,%xmm9 - vpxor %xmm13,%xmm6,%xmm6 - vpxor %xmm2,%xmm0,%xmm0 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - vpsrld $31,%xmm0,%xmm5 - vpaddd %xmm0,%xmm0,%xmm0 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm5,%xmm0,%xmm0 - vpor %xmm7,%xmm12,%xmm12 - vpxor %xmm3,%xmm1,%xmm1 - vmovdqa 224-128(%rax),%xmm3 - - vpslld $5,%xmm10,%xmm8 - vpaddd %xmm15,%xmm14,%xmm14 - vpxor %xmm11,%xmm13,%xmm6 - vpaddd %xmm0,%xmm14,%xmm14 - vpxor 64-128(%rax),%xmm1,%xmm1 - vpsrld $27,%xmm10,%xmm9 - vpxor %xmm12,%xmm6,%xmm6 - vpxor %xmm3,%xmm1,%xmm1 - - vpslld $30,%xmm11,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm14,%xmm14 - vpsrld $31,%xmm1,%xmm5 - vpaddd %xmm1,%xmm1,%xmm1 - - vpsrld $2,%xmm11,%xmm11 - vpaddd %xmm8,%xmm14,%xmm14 - vpor %xmm5,%xmm1,%xmm1 - vpor %xmm7,%xmm11,%xmm11 - vpxor %xmm4,%xmm2,%xmm2 - vmovdqa 240-128(%rax),%xmm4 - - vpslld $5,%xmm14,%xmm8 - vpaddd %xmm15,%xmm13,%xmm13 - vpxor %xmm10,%xmm12,%xmm6 - vpaddd %xmm1,%xmm13,%xmm13 - vpxor 80-128(%rax),%xmm2,%xmm2 - vpsrld $27,%xmm14,%xmm9 - vpxor %xmm11,%xmm6,%xmm6 - vpxor %xmm4,%xmm2,%xmm2 - - vpslld $30,%xmm10,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm13,%xmm13 - vpsrld $31,%xmm2,%xmm5 - vpaddd %xmm2,%xmm2,%xmm2 - - vpsrld $2,%xmm10,%xmm10 - vpaddd %xmm8,%xmm13,%xmm13 - vpor %xmm5,%xmm2,%xmm2 - vpor %xmm7,%xmm10,%xmm10 - vpxor 
%xmm0,%xmm3,%xmm3 - vmovdqa 0-128(%rax),%xmm0 - - vpslld $5,%xmm13,%xmm8 - vpaddd %xmm15,%xmm12,%xmm12 - vpxor %xmm14,%xmm11,%xmm6 - vpaddd %xmm2,%xmm12,%xmm12 - vpxor 96-128(%rax),%xmm3,%xmm3 - vpsrld $27,%xmm13,%xmm9 - vpxor %xmm10,%xmm6,%xmm6 - vpxor %xmm0,%xmm3,%xmm3 - - vpslld $30,%xmm14,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - vpsrld $31,%xmm3,%xmm5 - vpaddd %xmm3,%xmm3,%xmm3 - - vpsrld $2,%xmm14,%xmm14 - vpaddd %xmm8,%xmm12,%xmm12 - vpor %xmm5,%xmm3,%xmm3 - vpor %xmm7,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqa 16-128(%rax),%xmm1 - - vpslld $5,%xmm12,%xmm8 - vpaddd %xmm15,%xmm11,%xmm11 - vpxor %xmm13,%xmm10,%xmm6 - vpaddd %xmm3,%xmm11,%xmm11 - vpxor 112-128(%rax),%xmm4,%xmm4 - vpsrld $27,%xmm12,%xmm9 - vpxor %xmm14,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm4 - - vpslld $30,%xmm13,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm11,%xmm11 - vpsrld $31,%xmm4,%xmm5 - vpaddd %xmm4,%xmm4,%xmm4 - - vpsrld $2,%xmm13,%xmm13 - vpaddd %xmm8,%xmm11,%xmm11 - vpor %xmm5,%xmm4,%xmm4 - vpor %xmm7,%xmm13,%xmm13 - vpslld $5,%xmm11,%xmm8 - vpaddd %xmm15,%xmm10,%xmm10 - vpxor %xmm12,%xmm14,%xmm6 - - vpsrld $27,%xmm11,%xmm9 - vpaddd %xmm4,%xmm10,%xmm10 - vpxor %xmm13,%xmm6,%xmm6 - - vpslld $30,%xmm12,%xmm7 - vpor %xmm9,%xmm8,%xmm8 - vpaddd %xmm6,%xmm10,%xmm10 - - vpsrld $2,%xmm12,%xmm12 - vpaddd %xmm8,%xmm10,%xmm10 - vpor %xmm7,%xmm12,%xmm12 - movl $1,%ecx - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqu (%rbx),%xmm6 - vpxor %xmm8,%xmm8,%xmm8 - vmovdqa %xmm6,%xmm7 - vpcmpgtd %xmm8,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - - vpand %xmm7,%xmm10,%xmm10 - vpand %xmm7,%xmm11,%xmm11 - vpaddd 0(%rdi),%xmm10,%xmm10 - vpand %xmm7,%xmm12,%xmm12 - vpaddd 32(%rdi),%xmm11,%xmm11 - vpand %xmm7,%xmm13,%xmm13 - vpaddd 64(%rdi),%xmm12,%xmm12 - vpand %xmm7,%xmm14,%xmm14 - vpaddd 96(%rdi),%xmm13,%xmm13 - vpaddd 128(%rdi),%xmm14,%xmm14 - vmovdqu %xmm10,0(%rdi) - vmovdqu %xmm11,32(%rdi) - vmovdqu %xmm12,64(%rdi) - vmovdqu %xmm13,96(%rdi) - vmovdqu %xmm14,128(%rdi) - - vmovdqu %xmm6,(%rbx) - vmovdqu 96(%rbp),%xmm5 - decl %edx - jnz L$oop_avx - - movl 280(%rsp),%edx - leaq 16(%rdi),%rdi - leaq 64(%rsi),%rsi - decl %edx - jnz L$oop_grande_avx - -L$done_avx: - movq 272(%rsp),%rax - vzeroupper - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$epilogue_avx: - .byte 0xf3,0xc3 - - -.p2align 5 -sha1_multi_block_avx2: -_avx2_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $576,%rsp - andq $-256,%rsp - movq %rax,544(%rsp) -L$body_avx2: - leaq K_XX_XX(%rip),%rbp - shrl $1,%edx - - vzeroupper -L$oop_grande_avx2: - movl %edx,552(%rsp) - xorl %edx,%edx - leaq 512(%rsp),%rbx - movq 0(%rsi),%r12 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r12 - movq 16(%rsi),%r13 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r13 - movq 32(%rsi),%r14 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r14 - movq 48(%rsi),%r15 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r15 - movq 64(%rsi),%r8 - movl 72(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,16(%rbx) - cmovleq %rbp,%r8 - movq 80(%rsi),%r9 - movl 88(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl 
%ecx,%ecx - movl %ecx,20(%rbx) - cmovleq %rbp,%r9 - movq 96(%rsi),%r10 - movl 104(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,24(%rbx) - cmovleq %rbp,%r10 - movq 112(%rsi),%r11 - movl 120(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,28(%rbx) - cmovleq %rbp,%r11 - vmovdqu 0(%rdi),%ymm0 - leaq 128(%rsp),%rax - vmovdqu 32(%rdi),%ymm1 - leaq 256+128(%rsp),%rbx - vmovdqu 64(%rdi),%ymm2 - vmovdqu 96(%rdi),%ymm3 - vmovdqu 128(%rdi),%ymm4 - vmovdqu 96(%rbp),%ymm9 - jmp L$oop_avx2 - -.p2align 5 -L$oop_avx2: - vmovdqa -32(%rbp),%ymm15 - vmovd (%r12),%xmm10 - leaq 64(%r12),%r12 - vmovd (%r8),%xmm12 - leaq 64(%r8),%r8 - vmovd (%r13),%xmm7 - leaq 64(%r13),%r13 - vmovd (%r9),%xmm6 - leaq 64(%r9),%r9 - vpinsrd $1,(%r14),%xmm10,%xmm10 - leaq 64(%r14),%r14 - vpinsrd $1,(%r10),%xmm12,%xmm12 - leaq 64(%r10),%r10 - vpinsrd $1,(%r15),%xmm7,%xmm7 - leaq 64(%r15),%r15 - vpunpckldq %ymm7,%ymm10,%ymm10 - vpinsrd $1,(%r11),%xmm6,%xmm6 - leaq 64(%r11),%r11 - vpunpckldq %ymm6,%ymm12,%ymm12 - vmovd -60(%r12),%xmm11 - vinserti128 $1,%xmm12,%ymm10,%ymm10 - vmovd -60(%r8),%xmm8 - vpshufb %ymm9,%ymm10,%ymm10 - vmovd -60(%r13),%xmm7 - vmovd -60(%r9),%xmm6 - vpinsrd $1,-60(%r14),%xmm11,%xmm11 - vpinsrd $1,-60(%r10),%xmm8,%xmm8 - vpinsrd $1,-60(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm11,%ymm11 - vpinsrd $1,-60(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpandn %ymm3,%ymm1,%ymm6 - vpand %ymm2,%ymm1,%ymm5 - - vmovdqa %ymm10,0-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vinserti128 $1,%xmm8,%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -56(%r12),%xmm12 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -56(%r8),%xmm8 - vpaddd %ymm5,%ymm4,%ymm4 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpshufb %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vmovd -56(%r13),%xmm7 - vmovd -56(%r9),%xmm6 - vpinsrd $1,-56(%r14),%xmm12,%xmm12 - vpinsrd $1,-56(%r10),%xmm8,%xmm8 - vpinsrd $1,-56(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm12,%ymm12 - vpinsrd $1,-56(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpandn %ymm2,%ymm0,%ymm6 - vpand %ymm1,%ymm0,%ymm5 - - vmovdqa %ymm11,32-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vinserti128 $1,%xmm8,%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -52(%r12),%xmm13 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -52(%r8),%xmm8 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpshufb %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vmovd -52(%r13),%xmm7 - vmovd -52(%r9),%xmm6 - vpinsrd $1,-52(%r14),%xmm13,%xmm13 - vpinsrd $1,-52(%r10),%xmm8,%xmm8 - vpinsrd $1,-52(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm13,%ymm13 - vpinsrd $1,-52(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpandn %ymm1,%ymm4,%ymm6 - vpand %ymm0,%ymm4,%ymm5 - - vmovdqa %ymm12,64-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vinserti128 $1,%xmm8,%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -48(%r12),%xmm14 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -48(%r8),%xmm8 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpshufb %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vmovd -48(%r13),%xmm7 - vmovd -48(%r9),%xmm6 - vpinsrd $1,-48(%r14),%xmm14,%xmm14 - vpinsrd $1,-48(%r10),%xmm8,%xmm8 - vpinsrd $1,-48(%r15),%xmm7,%xmm7 - vpunpckldq 
%ymm7,%ymm14,%ymm14 - vpinsrd $1,-48(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpandn %ymm0,%ymm3,%ymm6 - vpand %ymm4,%ymm3,%ymm5 - - vmovdqa %ymm13,96-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vinserti128 $1,%xmm8,%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -44(%r12),%xmm10 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -44(%r8),%xmm8 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpshufb %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vmovd -44(%r13),%xmm7 - vmovd -44(%r9),%xmm6 - vpinsrd $1,-44(%r14),%xmm10,%xmm10 - vpinsrd $1,-44(%r10),%xmm8,%xmm8 - vpinsrd $1,-44(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm10,%ymm10 - vpinsrd $1,-44(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - vpandn %ymm4,%ymm2,%ymm6 - vpand %ymm3,%ymm2,%ymm5 - - vmovdqa %ymm14,128-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vinserti128 $1,%xmm8,%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -40(%r12),%xmm11 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -40(%r8),%xmm8 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpshufb %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vmovd -40(%r13),%xmm7 - vmovd -40(%r9),%xmm6 - vpinsrd $1,-40(%r14),%xmm11,%xmm11 - vpinsrd $1,-40(%r10),%xmm8,%xmm8 - vpinsrd $1,-40(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm11,%ymm11 - vpinsrd $1,-40(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpandn %ymm3,%ymm1,%ymm6 - vpand %ymm2,%ymm1,%ymm5 - - vmovdqa %ymm10,160-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vinserti128 $1,%xmm8,%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -36(%r12),%xmm12 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -36(%r8),%xmm8 - vpaddd %ymm5,%ymm4,%ymm4 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpshufb %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vmovd -36(%r13),%xmm7 - vmovd -36(%r9),%xmm6 - vpinsrd $1,-36(%r14),%xmm12,%xmm12 - vpinsrd $1,-36(%r10),%xmm8,%xmm8 - vpinsrd $1,-36(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm12,%ymm12 - vpinsrd $1,-36(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpandn %ymm2,%ymm0,%ymm6 - vpand %ymm1,%ymm0,%ymm5 - - vmovdqa %ymm11,192-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vinserti128 $1,%xmm8,%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -32(%r12),%xmm13 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -32(%r8),%xmm8 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpshufb %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vmovd -32(%r13),%xmm7 - vmovd -32(%r9),%xmm6 - vpinsrd $1,-32(%r14),%xmm13,%xmm13 - vpinsrd $1,-32(%r10),%xmm8,%xmm8 - vpinsrd $1,-32(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm13,%ymm13 - vpinsrd $1,-32(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpandn %ymm1,%ymm4,%ymm6 - vpand %ymm0,%ymm4,%ymm5 - - vmovdqa %ymm12,224-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vinserti128 $1,%xmm8,%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -28(%r12),%xmm14 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -28(%r8),%xmm8 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpshufb %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vmovd 
-28(%r13),%xmm7 - vmovd -28(%r9),%xmm6 - vpinsrd $1,-28(%r14),%xmm14,%xmm14 - vpinsrd $1,-28(%r10),%xmm8,%xmm8 - vpinsrd $1,-28(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm14,%ymm14 - vpinsrd $1,-28(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpandn %ymm0,%ymm3,%ymm6 - vpand %ymm4,%ymm3,%ymm5 - - vmovdqa %ymm13,256-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vinserti128 $1,%xmm8,%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -24(%r12),%xmm10 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -24(%r8),%xmm8 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpshufb %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vmovd -24(%r13),%xmm7 - vmovd -24(%r9),%xmm6 - vpinsrd $1,-24(%r14),%xmm10,%xmm10 - vpinsrd $1,-24(%r10),%xmm8,%xmm8 - vpinsrd $1,-24(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm10,%ymm10 - vpinsrd $1,-24(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - vpandn %ymm4,%ymm2,%ymm6 - vpand %ymm3,%ymm2,%ymm5 - - vmovdqa %ymm14,288-256-128(%rbx) - vpaddd %ymm14,%ymm0,%ymm0 - vinserti128 $1,%xmm8,%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -20(%r12),%xmm11 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -20(%r8),%xmm8 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpshufb %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vmovd -20(%r13),%xmm7 - vmovd -20(%r9),%xmm6 - vpinsrd $1,-20(%r14),%xmm11,%xmm11 - vpinsrd $1,-20(%r10),%xmm8,%xmm8 - vpinsrd $1,-20(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm11,%ymm11 - vpinsrd $1,-20(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpandn %ymm3,%ymm1,%ymm6 - vpand %ymm2,%ymm1,%ymm5 - - vmovdqa %ymm10,320-256-128(%rbx) - vpaddd %ymm10,%ymm4,%ymm4 - vinserti128 $1,%xmm8,%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -16(%r12),%xmm12 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -16(%r8),%xmm8 - vpaddd %ymm5,%ymm4,%ymm4 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpshufb %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vmovd -16(%r13),%xmm7 - vmovd -16(%r9),%xmm6 - vpinsrd $1,-16(%r14),%xmm12,%xmm12 - vpinsrd $1,-16(%r10),%xmm8,%xmm8 - vpinsrd $1,-16(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm12,%ymm12 - vpinsrd $1,-16(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpandn %ymm2,%ymm0,%ymm6 - vpand %ymm1,%ymm0,%ymm5 - - vmovdqa %ymm11,352-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vinserti128 $1,%xmm8,%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -12(%r12),%xmm13 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -12(%r8),%xmm8 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpshufb %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vmovd -12(%r13),%xmm7 - vmovd -12(%r9),%xmm6 - vpinsrd $1,-12(%r14),%xmm13,%xmm13 - vpinsrd $1,-12(%r10),%xmm8,%xmm8 - vpinsrd $1,-12(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm13,%ymm13 - vpinsrd $1,-12(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpandn %ymm1,%ymm4,%ymm6 - vpand %ymm0,%ymm4,%ymm5 - - vmovdqa %ymm12,384-256-128(%rbx) - vpaddd %ymm12,%ymm2,%ymm2 - vinserti128 $1,%xmm8,%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -8(%r12),%xmm14 - - vpslld $30,%ymm4,%ymm6 - vpor 
%ymm8,%ymm7,%ymm7 - vmovd -8(%r8),%xmm8 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpshufb %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vmovd -8(%r13),%xmm7 - vmovd -8(%r9),%xmm6 - vpinsrd $1,-8(%r14),%xmm14,%xmm14 - vpinsrd $1,-8(%r10),%xmm8,%xmm8 - vpinsrd $1,-8(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm14,%ymm14 - vpinsrd $1,-8(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpandn %ymm0,%ymm3,%ymm6 - vpand %ymm4,%ymm3,%ymm5 - - vmovdqa %ymm13,416-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vinserti128 $1,%xmm8,%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vmovd -4(%r12),%xmm10 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vmovd -4(%r8),%xmm8 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpshufb %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vmovdqa 0-128(%rax),%ymm11 - vmovd -4(%r13),%xmm7 - vmovd -4(%r9),%xmm6 - vpinsrd $1,-4(%r14),%xmm10,%xmm10 - vpinsrd $1,-4(%r10),%xmm8,%xmm8 - vpinsrd $1,-4(%r15),%xmm7,%xmm7 - vpunpckldq %ymm7,%ymm10,%ymm10 - vpinsrd $1,-4(%r11),%xmm6,%xmm6 - vpunpckldq %ymm6,%ymm8,%ymm8 - vpaddd %ymm15,%ymm0,%ymm0 - prefetcht0 63(%r12) - vpslld $5,%ymm1,%ymm7 - vpandn %ymm4,%ymm2,%ymm6 - vpand %ymm3,%ymm2,%ymm5 - - vmovdqa %ymm14,448-256-128(%rbx) - vpaddd %ymm14,%ymm0,%ymm0 - vinserti128 $1,%xmm8,%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - prefetcht0 63(%r13) - vpxor %ymm6,%ymm5,%ymm5 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - prefetcht0 63(%r14) - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - prefetcht0 63(%r15) - vpshufb %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vmovdqa 32-128(%rax),%ymm12 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 64-128(%rax),%ymm13 - - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpandn %ymm3,%ymm1,%ymm6 - prefetcht0 63(%r8) - vpand %ymm2,%ymm1,%ymm5 - - vmovdqa %ymm10,480-256-128(%rbx) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 256-256-128(%rbx),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - prefetcht0 63(%r9) - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - prefetcht0 63(%r10) - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - prefetcht0 63(%r11) - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 96-128(%rax),%ymm14 - - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpandn %ymm2,%ymm0,%ymm6 - - vpand %ymm1,%ymm0,%ymm5 - - vmovdqa %ymm11,0-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 288-256-128(%rbx),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 128-128(%rax),%ymm10 - - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpandn %ymm1,%ymm4,%ymm6 - - vpand %ymm0,%ymm4,%ymm5 - - vmovdqa %ymm12,32-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 320-256-128(%rbx),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - - vpaddd 
%ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 160-128(%rax),%ymm11 - - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpandn %ymm0,%ymm3,%ymm6 - - vpand %ymm4,%ymm3,%ymm5 - - vmovdqa %ymm13,64-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 352-256-128(%rbx),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 192-128(%rax),%ymm12 - - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - vpandn %ymm4,%ymm2,%ymm6 - - vpand %ymm3,%ymm2,%ymm5 - - vmovdqa %ymm14,96-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 384-256-128(%rbx),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm6,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vmovdqa 0(%rbp),%ymm15 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 224-128(%rax),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,128-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 416-256-128(%rbx),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 256-256-128(%rbx),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,160-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 448-256-128(%rbx),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 288-256-128(%rbx),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,192-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 480-256-128(%rbx),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 320-256-128(%rbx),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,224-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 0-128(%rax),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor 
%ymm12,%ymm10,%ymm10 - vmovdqa 352-256-128(%rbx),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,256-256-128(%rbx) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 32-128(%rax),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 384-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,288-256-128(%rbx) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 64-128(%rax),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 416-256-128(%rbx),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,320-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 96-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 448-256-128(%rbx),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,352-256-128(%rbx) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 128-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 480-256-128(%rbx),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,384-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 160-128(%rax),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 0-128(%rax),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,416-256-128(%rbx) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 192-128(%rax),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 32-128(%rax),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,448-256-128(%rbx) - vpaddd 
%ymm10,%ymm4,%ymm4 - vpxor 224-128(%rax),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 64-128(%rax),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,480-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 256-256-128(%rbx),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 96-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,0-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 288-256-128(%rbx),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 128-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,32-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 320-256-128(%rbx),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 160-128(%rax),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,64-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 352-256-128(%rbx),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 192-128(%rax),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,96-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 384-256-128(%rbx),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 224-128(%rax),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,128-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 416-256-128(%rbx),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd 
%ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 256-256-128(%rbx),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,160-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 448-256-128(%rbx),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 288-256-128(%rbx),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,192-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 480-256-128(%rbx),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 320-256-128(%rbx),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,224-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 0-128(%rax),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vmovdqa 32(%rbp),%ymm15 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 352-256-128(%rbx),%ymm13 - - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpand %ymm2,%ymm3,%ymm6 - vpxor 32-128(%rax),%ymm11,%ymm11 - - vpaddd %ymm6,%ymm4,%ymm4 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm3,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vmovdqu %ymm10,256-256-128(%rbx) - vpaddd %ymm10,%ymm4,%ymm4 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm11,%ymm9 - vpand %ymm1,%ymm5,%ymm5 - vpaddd %ymm11,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpaddd %ymm5,%ymm4,%ymm4 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 384-256-128(%rbx),%ymm14 - - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpand %ymm1,%ymm2,%ymm6 - vpxor 64-128(%rax),%ymm12,%ymm12 - - vpaddd %ymm6,%ymm3,%ymm3 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm2,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vmovdqu %ymm11,288-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm12,%ymm9 - vpand %ymm0,%ymm5,%ymm5 - vpaddd %ymm12,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 416-256-128(%rbx),%ymm10 - - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpand %ymm0,%ymm1,%ymm6 - vpxor 96-128(%rax),%ymm13,%ymm13 - - vpaddd %ymm6,%ymm2,%ymm2 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm1,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vmovdqu %ymm12,320-256-128(%rbx) - vpaddd %ymm12,%ymm2,%ymm2 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm13,%ymm9 - vpand 
%ymm4,%ymm5,%ymm5 - vpaddd %ymm13,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 448-256-128(%rbx),%ymm11 - - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpand %ymm4,%ymm0,%ymm6 - vpxor 128-128(%rax),%ymm14,%ymm14 - - vpaddd %ymm6,%ymm1,%ymm1 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm0,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vmovdqu %ymm13,352-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm14,%ymm9 - vpand %ymm3,%ymm5,%ymm5 - vpaddd %ymm14,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 480-256-128(%rbx),%ymm12 - - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - vpand %ymm3,%ymm4,%ymm6 - vpxor 160-128(%rax),%ymm10,%ymm10 - - vpaddd %ymm6,%ymm0,%ymm0 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm4,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vmovdqu %ymm14,384-256-128(%rbx) - vpaddd %ymm14,%ymm0,%ymm0 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm10,%ymm9 - vpand %ymm2,%ymm5,%ymm5 - vpaddd %ymm10,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 0-128(%rax),%ymm13 - - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpand %ymm2,%ymm3,%ymm6 - vpxor 192-128(%rax),%ymm11,%ymm11 - - vpaddd %ymm6,%ymm4,%ymm4 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm3,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vmovdqu %ymm10,416-256-128(%rbx) - vpaddd %ymm10,%ymm4,%ymm4 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm11,%ymm9 - vpand %ymm1,%ymm5,%ymm5 - vpaddd %ymm11,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpaddd %ymm5,%ymm4,%ymm4 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 32-128(%rax),%ymm14 - - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpand %ymm1,%ymm2,%ymm6 - vpxor 224-128(%rax),%ymm12,%ymm12 - - vpaddd %ymm6,%ymm3,%ymm3 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm2,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vmovdqu %ymm11,448-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm12,%ymm9 - vpand %ymm0,%ymm5,%ymm5 - vpaddd %ymm12,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 64-128(%rax),%ymm10 - - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpand %ymm0,%ymm1,%ymm6 - vpxor 256-256-128(%rbx),%ymm13,%ymm13 - - vpaddd %ymm6,%ymm2,%ymm2 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm1,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vmovdqu %ymm12,480-256-128(%rbx) - vpaddd %ymm12,%ymm2,%ymm2 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm13,%ymm9 - vpand %ymm4,%ymm5,%ymm5 - vpaddd %ymm13,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 96-128(%rax),%ymm11 - - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpand %ymm4,%ymm0,%ymm6 - vpxor 288-256-128(%rbx),%ymm14,%ymm14 - - vpaddd %ymm6,%ymm1,%ymm1 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm0,%ymm5 
- vpxor %ymm11,%ymm14,%ymm14 - - vmovdqu %ymm13,0-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm14,%ymm9 - vpand %ymm3,%ymm5,%ymm5 - vpaddd %ymm14,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 128-128(%rax),%ymm12 - - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - vpand %ymm3,%ymm4,%ymm6 - vpxor 320-256-128(%rbx),%ymm10,%ymm10 - - vpaddd %ymm6,%ymm0,%ymm0 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm4,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vmovdqu %ymm14,32-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm10,%ymm9 - vpand %ymm2,%ymm5,%ymm5 - vpaddd %ymm10,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 160-128(%rax),%ymm13 - - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpand %ymm2,%ymm3,%ymm6 - vpxor 352-256-128(%rbx),%ymm11,%ymm11 - - vpaddd %ymm6,%ymm4,%ymm4 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm3,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vmovdqu %ymm10,64-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm11,%ymm9 - vpand %ymm1,%ymm5,%ymm5 - vpaddd %ymm11,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpaddd %ymm5,%ymm4,%ymm4 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 192-128(%rax),%ymm14 - - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpand %ymm1,%ymm2,%ymm6 - vpxor 384-256-128(%rbx),%ymm12,%ymm12 - - vpaddd %ymm6,%ymm3,%ymm3 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm2,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vmovdqu %ymm11,96-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm12,%ymm9 - vpand %ymm0,%ymm5,%ymm5 - vpaddd %ymm12,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 224-128(%rax),%ymm10 - - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpand %ymm0,%ymm1,%ymm6 - vpxor 416-256-128(%rbx),%ymm13,%ymm13 - - vpaddd %ymm6,%ymm2,%ymm2 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm1,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vmovdqu %ymm12,128-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm13,%ymm9 - vpand %ymm4,%ymm5,%ymm5 - vpaddd %ymm13,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 256-256-128(%rbx),%ymm11 - - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpand %ymm4,%ymm0,%ymm6 - vpxor 448-256-128(%rbx),%ymm14,%ymm14 - - vpaddd %ymm6,%ymm1,%ymm1 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm0,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vmovdqu %ymm13,160-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm14,%ymm9 - vpand %ymm3,%ymm5,%ymm5 - vpaddd %ymm14,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 288-256-128(%rbx),%ymm12 - - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - 
vpand %ymm3,%ymm4,%ymm6 - vpxor 480-256-128(%rbx),%ymm10,%ymm10 - - vpaddd %ymm6,%ymm0,%ymm0 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm4,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vmovdqu %ymm14,192-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm10,%ymm9 - vpand %ymm2,%ymm5,%ymm5 - vpaddd %ymm10,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 320-256-128(%rbx),%ymm13 - - vpaddd %ymm15,%ymm4,%ymm4 - vpslld $5,%ymm0,%ymm7 - vpand %ymm2,%ymm3,%ymm6 - vpxor 0-128(%rax),%ymm11,%ymm11 - - vpaddd %ymm6,%ymm4,%ymm4 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm3,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vmovdqu %ymm10,224-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm11,%ymm9 - vpand %ymm1,%ymm5,%ymm5 - vpaddd %ymm11,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpaddd %ymm5,%ymm4,%ymm4 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 352-256-128(%rbx),%ymm14 - - vpaddd %ymm15,%ymm3,%ymm3 - vpslld $5,%ymm4,%ymm7 - vpand %ymm1,%ymm2,%ymm6 - vpxor 32-128(%rax),%ymm12,%ymm12 - - vpaddd %ymm6,%ymm3,%ymm3 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm2,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vmovdqu %ymm11,256-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm12,%ymm9 - vpand %ymm0,%ymm5,%ymm5 - vpaddd %ymm12,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpaddd %ymm5,%ymm3,%ymm3 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 384-256-128(%rbx),%ymm10 - - vpaddd %ymm15,%ymm2,%ymm2 - vpslld $5,%ymm3,%ymm7 - vpand %ymm0,%ymm1,%ymm6 - vpxor 64-128(%rax),%ymm13,%ymm13 - - vpaddd %ymm6,%ymm2,%ymm2 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm1,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vmovdqu %ymm12,288-256-128(%rbx) - vpaddd %ymm12,%ymm2,%ymm2 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm13,%ymm9 - vpand %ymm4,%ymm5,%ymm5 - vpaddd %ymm13,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpaddd %ymm5,%ymm2,%ymm2 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 416-256-128(%rbx),%ymm11 - - vpaddd %ymm15,%ymm1,%ymm1 - vpslld $5,%ymm2,%ymm7 - vpand %ymm4,%ymm0,%ymm6 - vpxor 96-128(%rax),%ymm14,%ymm14 - - vpaddd %ymm6,%ymm1,%ymm1 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm0,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vmovdqu %ymm13,320-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm14,%ymm9 - vpand %ymm3,%ymm5,%ymm5 - vpaddd %ymm14,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 448-256-128(%rbx),%ymm12 - - vpaddd %ymm15,%ymm0,%ymm0 - vpslld $5,%ymm1,%ymm7 - vpand %ymm3,%ymm4,%ymm6 - vpxor 128-128(%rax),%ymm10,%ymm10 - - vpaddd %ymm6,%ymm0,%ymm0 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm4,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vmovdqu %ymm14,352-256-128(%rbx) - vpaddd %ymm14,%ymm0,%ymm0 - vpor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm10,%ymm9 - vpand %ymm2,%ymm5,%ymm5 - vpaddd %ymm10,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor 
%ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vmovdqa 64(%rbp),%ymm15 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 480-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,384-256-128(%rbx) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 160-128(%rax),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 0-128(%rax),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,416-256-128(%rbx) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 192-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 32-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,448-256-128(%rbx) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 224-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 64-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,480-256-128(%rbx) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 256-256-128(%rbx),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 96-128(%rax),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,0-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 288-256-128(%rbx),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 128-128(%rax),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,32-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 320-256-128(%rbx),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 160-128(%rax),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor 
%ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,64-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 352-256-128(%rbx),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 192-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vmovdqa %ymm12,96-128(%rax) - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 384-256-128(%rbx),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 224-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vmovdqa %ymm13,128-128(%rax) - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 416-256-128(%rbx),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 256-256-128(%rbx),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vmovdqa %ymm14,160-128(%rax) - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 448-256-128(%rbx),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 288-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vmovdqa %ymm10,192-128(%rax) - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 480-256-128(%rbx),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 320-256-128(%rbx),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vmovdqa %ymm11,224-128(%rax) - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 0-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 352-256-128(%rbx),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 32-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - 
vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 384-256-128(%rbx),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 64-128(%rax),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpxor %ymm12,%ymm10,%ymm10 - vmovdqa 416-256-128(%rbx),%ymm12 - - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - vpaddd %ymm14,%ymm0,%ymm0 - vpxor 96-128(%rax),%ymm10,%ymm10 - vpsrld $27,%ymm1,%ymm8 - vpxor %ymm3,%ymm5,%ymm5 - vpxor %ymm12,%ymm10,%ymm10 - - vpslld $30,%ymm2,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpsrld $31,%ymm10,%ymm9 - vpaddd %ymm10,%ymm10,%ymm10 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm9,%ymm10,%ymm10 - vpor %ymm6,%ymm2,%ymm2 - vpxor %ymm13,%ymm11,%ymm11 - vmovdqa 448-256-128(%rbx),%ymm13 - - vpslld $5,%ymm0,%ymm7 - vpaddd %ymm15,%ymm4,%ymm4 - vpxor %ymm1,%ymm3,%ymm5 - vpaddd %ymm10,%ymm4,%ymm4 - vpxor 128-128(%rax),%ymm11,%ymm11 - vpsrld $27,%ymm0,%ymm8 - vpxor %ymm2,%ymm5,%ymm5 - vpxor %ymm13,%ymm11,%ymm11 - - vpslld $30,%ymm1,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm4,%ymm4 - vpsrld $31,%ymm11,%ymm9 - vpaddd %ymm11,%ymm11,%ymm11 - - vpsrld $2,%ymm1,%ymm1 - vpaddd %ymm7,%ymm4,%ymm4 - vpor %ymm9,%ymm11,%ymm11 - vpor %ymm6,%ymm1,%ymm1 - vpxor %ymm14,%ymm12,%ymm12 - vmovdqa 480-256-128(%rbx),%ymm14 - - vpslld $5,%ymm4,%ymm7 - vpaddd %ymm15,%ymm3,%ymm3 - vpxor %ymm0,%ymm2,%ymm5 - vpaddd %ymm11,%ymm3,%ymm3 - vpxor 160-128(%rax),%ymm12,%ymm12 - vpsrld $27,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm14,%ymm12,%ymm12 - - vpslld $30,%ymm0,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm3,%ymm3 - vpsrld $31,%ymm12,%ymm9 - vpaddd %ymm12,%ymm12,%ymm12 - - vpsrld $2,%ymm0,%ymm0 - vpaddd %ymm7,%ymm3,%ymm3 - vpor %ymm9,%ymm12,%ymm12 - vpor %ymm6,%ymm0,%ymm0 - vpxor %ymm10,%ymm13,%ymm13 - vmovdqa 0-128(%rax),%ymm10 - - vpslld $5,%ymm3,%ymm7 - vpaddd %ymm15,%ymm2,%ymm2 - vpxor %ymm4,%ymm1,%ymm5 - vpaddd %ymm12,%ymm2,%ymm2 - vpxor 192-128(%rax),%ymm13,%ymm13 - vpsrld $27,%ymm3,%ymm8 - vpxor %ymm0,%ymm5,%ymm5 - vpxor %ymm10,%ymm13,%ymm13 - - vpslld $30,%ymm4,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm2,%ymm2 - vpsrld $31,%ymm13,%ymm9 - vpaddd %ymm13,%ymm13,%ymm13 - - vpsrld $2,%ymm4,%ymm4 - vpaddd %ymm7,%ymm2,%ymm2 - vpor %ymm9,%ymm13,%ymm13 - vpor %ymm6,%ymm4,%ymm4 - vpxor %ymm11,%ymm14,%ymm14 - vmovdqa 32-128(%rax),%ymm11 - - vpslld $5,%ymm2,%ymm7 - vpaddd %ymm15,%ymm1,%ymm1 - vpxor %ymm3,%ymm0,%ymm5 - vpaddd %ymm13,%ymm1,%ymm1 - vpxor 224-128(%rax),%ymm14,%ymm14 - vpsrld $27,%ymm2,%ymm8 - vpxor %ymm4,%ymm5,%ymm5 - vpxor %ymm11,%ymm14,%ymm14 - - vpslld $30,%ymm3,%ymm6 - vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm1,%ymm1 - vpsrld $31,%ymm14,%ymm9 - vpaddd %ymm14,%ymm14,%ymm14 - - vpsrld $2,%ymm3,%ymm3 - vpaddd %ymm7,%ymm1,%ymm1 - vpor %ymm9,%ymm14,%ymm14 - vpor %ymm6,%ymm3,%ymm3 - vpslld $5,%ymm1,%ymm7 - vpaddd %ymm15,%ymm0,%ymm0 - vpxor %ymm2,%ymm4,%ymm5 - - vpsrld $27,%ymm1,%ymm8 - vpaddd %ymm14,%ymm0,%ymm0 - vpxor %ymm3,%ymm5,%ymm5 - - vpslld $30,%ymm2,%ymm6 - 
vpor %ymm8,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - - vpsrld $2,%ymm2,%ymm2 - vpaddd %ymm7,%ymm0,%ymm0 - vpor %ymm6,%ymm2,%ymm2 - movl $1,%ecx - leaq 512(%rsp),%rbx - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r12 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r13 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r14 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r15 - cmpl 16(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 20(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 24(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 28(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqu (%rbx),%ymm5 - vpxor %ymm7,%ymm7,%ymm7 - vmovdqa %ymm5,%ymm6 - vpcmpgtd %ymm7,%ymm6,%ymm6 - vpaddd %ymm6,%ymm5,%ymm5 - - vpand %ymm6,%ymm0,%ymm0 - vpand %ymm6,%ymm1,%ymm1 - vpaddd 0(%rdi),%ymm0,%ymm0 - vpand %ymm6,%ymm2,%ymm2 - vpaddd 32(%rdi),%ymm1,%ymm1 - vpand %ymm6,%ymm3,%ymm3 - vpaddd 64(%rdi),%ymm2,%ymm2 - vpand %ymm6,%ymm4,%ymm4 - vpaddd 96(%rdi),%ymm3,%ymm3 - vpaddd 128(%rdi),%ymm4,%ymm4 - vmovdqu %ymm0,0(%rdi) - vmovdqu %ymm1,32(%rdi) - vmovdqu %ymm2,64(%rdi) - vmovdqu %ymm3,96(%rdi) - vmovdqu %ymm4,128(%rdi) - - vmovdqu %ymm5,(%rbx) - leaq 256+128(%rsp),%rbx - vmovdqu 96(%rbp),%ymm9 - decl %edx - jnz L$oop_avx2 - - - - - - - -L$done_avx2: - movq 544(%rsp),%rax - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$epilogue_avx2: - .byte 0xf3,0xc3 - - .p2align 8 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s index c89ffe3df60f54..798ca0dc4d076a 100644 --- a/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/sha/sha1-x86_64.s @@ -12,14 +12,6 @@ _sha1_block_data_order: jz L$ialu testl $536870912,%r10d jnz _shaext_shortcut - andl $296,%r10d - cmpl $296,%r10d - je _avx2_shortcut - andl $268435456,%r8d - andl $1073741824,%r9d - orl %r9d,%r8d - cmpl $1342177280,%r8d - je _avx_shortcut jmp _ssse3_shortcut .p2align 4 @@ -1248,9 +1240,9 @@ _shaext_shortcut: movdqa K_XX_XX+160(%rip),%xmm3 movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 + pshufd $0b00011011,%xmm0,%xmm0 movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm1,%xmm1 movdqu 32(%rsi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%rsi),%xmm7 @@ -1400,8 +1392,8 @@ L$oop_shaext: jnz L$oop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) .byte 0xf3,0xc3 @@ -2582,2803 +2574,6 @@ L$done_ssse3: L$epilogue_ssse3: .byte 0xf3,0xc3 - -.p2align 4 -sha1_block_data_order_avx: -_avx_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - leaq -64(%rsp),%rsp - vzeroupper - movq %rax,%r14 - andq $-64,%rsp - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - shlq $6,%r10 - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 - - movl 0(%r8),%eax - movl 4(%r8),%ebx - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl %ebx,%esi - movl 16(%r8),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm11,%xmm0,%xmm4 - vpaddd %xmm11,%xmm1,%xmm5 - vpaddd %xmm11,%xmm2,%xmm6 - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa 
%xmm6,32(%rsp) - jmp L$oop_avx -.p2align 4 -L$oop_avx: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%edi - addl 0(%rsp),%ebp - vpaddd %xmm3,%xmm11,%xmm9 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%ebp - vpxor %xmm2,%xmm8,%xmm8 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm10 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm4,%xmm4 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vpxor %xmm10,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm11,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm10 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm10,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vpaddd %xmm5,%xmm11,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm10 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm6,%xmm6 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm10,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 
48(%rsp),%ecx - vpaddd %xmm6,%xmm11,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm10 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm10,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm11,%xmm9 - addl %esi,%edx - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - 
shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - xorl %eax,%edi - vpaddd %xmm3,%xmm11,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm11,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm11,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %ebp,%edi - shldl 
$5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm11,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm11,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm11,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 
36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm11,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r10,%r9 - je L$done_avx - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm11,%xmm0,%xmm4 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm11,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm5,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl 
%ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm11,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm6,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) - andl %edi,%esi - jmp L$oop_avx - -.p2align 4 -L$done_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroupper - - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx - movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -L$epilogue_avx: - .byte 0xf3,0xc3 - - -.p2align 4 -sha1_block_data_order_avx2: -_avx2_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - vzeroupper - movq %rax,%r14 - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - leaq -640(%rsp),%rsp - shlq $6,%r10 - leaq 64(%r9),%r13 - 
andq $-128,%rsp - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 - - movl 0(%r8),%eax - cmpq %r10,%r13 - cmovaeq %r9,%r13 - movl 4(%r8),%ebp - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl 16(%r8),%esi - vmovdqu 64(%r11),%ymm6 - - vmovdqu (%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - leaq 64(%r9),%r9 - vinserti128 $1,(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vpshufb %ymm6,%ymm0,%ymm0 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vpshufb %ymm6,%ymm1,%ymm1 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - vpshufb %ymm6,%ymm2,%ymm2 - vmovdqu -64(%r11),%ymm11 - vpshufb %ymm6,%ymm3,%ymm3 - - vpaddd %ymm11,%ymm0,%ymm4 - vpaddd %ymm11,%ymm1,%ymm5 - vmovdqu %ymm4,0(%rsp) - vpaddd %ymm11,%ymm2,%ymm6 - vmovdqu %ymm5,32(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - vmovdqu %ymm6,64(%rsp) - vmovdqu %ymm7,96(%rsp) - vpalignr $8,%ymm0,%ymm1,%ymm4 - vpsrldq $4,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $31,%ymm4,%ymm8 - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - vpxor %ymm10,%ymm4,%ymm4 - vpaddd %ymm11,%ymm4,%ymm9 - vmovdqu %ymm9,128(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm5 - vpsrldq $4,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm5,%ymm5 - vpaddd %ymm11,%ymm5,%ymm9 - vmovdqu %ymm9,160(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm6 - vpsrldq $4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $31,%ymm6,%ymm8 - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - vpxor %ymm10,%ymm6,%ymm6 - vpaddd %ymm11,%ymm6,%ymm9 - vmovdqu %ymm9,192(%rsp) - vpalignr $8,%ymm3,%ymm4,%ymm7 - vpsrldq $4,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm7,%ymm8 - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - vpxor %ymm10,%ymm7,%ymm7 - vpaddd %ymm11,%ymm7,%ymm9 - vmovdqu %ymm9,224(%rsp) - leaq 128(%rsp),%r13 - jmp L$oop_avx2 -.p2align 5 -L$oop_avx2: - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - jmp L$align32_1 -.p2align 5 -L$align32_1: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - vpxor %ymm1,%ymm0,%ymm0 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpxor %ymm8,%ymm0,%ymm0 - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vpor %ymm8,%ymm0,%ymm0 - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - vpaddd %ymm11,%ymm0,%ymm9 - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - vmovdqu %ymm9,256(%rsp) - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl 
%esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - vpxor %ymm2,%ymm1,%ymm1 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpxor %ymm8,%ymm1,%ymm1 - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vpor %ymm8,%ymm1,%ymm1 - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - vpaddd %ymm11,%ymm1,%ymm9 - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vmovdqu %ymm9,288(%rsp) - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - vpxor %ymm3,%ymm2,%ymm2 - vmovdqu 0(%r11),%ymm11 - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpxor %ymm8,%ymm2,%ymm2 - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vpor %ymm8,%ymm2,%ymm2 - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - vpaddd %ymm11,%ymm2,%ymm9 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vmovdqu %ymm9,320(%rsp) - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - vpxor %ymm4,%ymm3,%ymm3 - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpxor %ymm8,%ymm3,%ymm3 - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - vpor %ymm8,%ymm3,%ymm3 - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - vpaddd %ymm11,%ymm3,%ymm9 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vmovdqu %ymm9,352(%rsp) - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpalignr $8,%ymm2,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpxor %ymm5,%ymm4,%ymm4 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpxor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - vpsrld $30,%ymm4,%ymm8 - vpslld $2,%ymm4,%ymm4 - leal 
(%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpor %ymm8,%ymm4,%ymm4 - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpaddd %ymm11,%ymm4,%ymm9 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - vmovdqu %ymm9,384(%rsp) - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpalignr $8,%ymm3,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm6,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpxor %ymm8,%ymm5,%ymm5 - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - vpsrld $30,%ymm5,%ymm8 - vpslld $2,%ymm5,%ymm5 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vpor %ymm8,%ymm5,%ymm5 - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - vmovdqu %ymm9,416(%rsp) - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm7,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - vpxor %ymm8,%ymm6,%ymm6 - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - vpsrld $30,%ymm6,%ymm8 - vpslld $2,%ymm6,%ymm6 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpor %ymm8,%ymm6,%ymm6 - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - vmovdqu %ymm9,448(%rsp) - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm5,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm0,%ymm7,%ymm7 - vmovdqu 32(%r11),%ymm11 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpxor %ymm8,%ymm7,%ymm7 - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - vpsrld $30,%ymm7,%ymm8 - vpslld $2,%ymm7,%ymm7 - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpor %ymm8,%ymm7,%ymm7 - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - vmovdqu %ymm9,480(%rsp) - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl 
%r12d,%ebx - andl %edi,%ecx - jmp L$align32_2 -.p2align 5 -L$align32_2: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -64(%r13),%ebp - xorl %esi,%ecx - vpxor %ymm1,%ymm0,%ymm0 - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - vpxor %ymm8,%ymm0,%ymm0 - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - vpor %ymm8,%ymm0,%ymm0 - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpaddd %ymm11,%ymm0,%ymm9 - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - vmovdqu %ymm9,512(%rsp) - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -28(%r13),%ebx - xorl %eax,%edx - vpxor %ymm2,%ymm1,%ymm1 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpxor %ymm8,%ymm1,%ymm1 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - vpor %ymm8,%ymm1,%ymm1 - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpaddd %ymm11,%ymm1,%ymm9 - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - vmovdqu %ymm9,544(%rsp) - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl 8(%r13),%ecx - xorl %ebp,%esi - vpxor %ymm3,%ymm2,%ymm2 - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - vpxor %ymm8,%ymm2,%ymm2 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - vpor %ymm8,%ymm2,%ymm2 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpaddd %ymm11,%ymm2,%ymm9 - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - vmovdqu %ymm9,576(%rsp) - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl 
%ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl 44(%r13),%edx - xorl %ebx,%eax - vpxor %ymm4,%ymm3,%ymm3 - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm3,%ymm3 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl %r12d,%edx - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - vpor %ymm8,%ymm3,%ymm3 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpaddd %ymm11,%ymm3,%ymm9 - addl %r12d,%ecx - andl %edi,%edx - addl 68(%r13),%ebx - xorl %eax,%edx - vmovdqu %ymm9,608(%rsp) - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -96(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -60(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal 
(%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%r9),%r13 - leaq 128(%r9),%rdi - cmpq %r10,%r13 - cmovaeq %r9,%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - je L$done_avx2 - vmovdqu 64(%r11),%ymm6 - cmpq %r10,%rdi - ja L$ast_avx2 - - vmovdqu -64(%rdi),%xmm0 - vmovdqu -48(%rdi),%xmm1 - vmovdqu -32(%rdi),%xmm2 - vmovdqu -16(%rdi),%xmm3 - vinserti128 $1,0(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - jmp L$ast_avx2 - -.p2align 5 -L$ast_avx2: - leaq 128+16(%rsp),%r13 - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - subq $-128,%r9 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl 
%ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vmovdqu -64(%r11),%ymm11 - vpshufb %ymm6,%ymm0,%ymm0 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpshufb %ymm6,%ymm1,%ymm1 - vpaddd %ymm11,%ymm0,%ymm8 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vmovdqu %ymm8,0(%rsp) - vpshufb %ymm6,%ymm2,%ymm2 - vpaddd %ymm11,%ymm1,%ymm9 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl 
$27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - vmovdqu %ymm9,32(%rsp) - vpshufb %ymm6,%ymm3,%ymm3 - vpaddd %ymm11,%ymm2,%ymm6 - addl -64(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - jmp L$align32_3 -.p2align 5 -L$align32_3: - vmovdqu %ymm6,64(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - addl -28(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vmovdqu %ymm7,96(%rsp) - addl 8(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - 
rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm0,%ymm1,%ymm4 - addl 44(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - vpsrldq $4,%ymm3,%ymm8 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - xorl %ebp,%esi - addl %r12d,%edx - vpxor %ymm8,%ymm4,%ymm4 - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - vpsrld $31,%ymm4,%ymm8 - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - andl %edi,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - addl 68(%r13),%ebx - xorl %eax,%edx - vpxor %ymm10,%ymm4,%ymm4 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpaddd %ymm11,%ymm4,%ymm9 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vmovdqu %ymm9,128(%rsp) - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm1,%ymm2,%ymm5 - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrldq $4,%ymm4,%ymm8 - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r11),%ymm11 - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - xorl %eax,%edx - addl %r12d,%ecx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - vpxor %ymm10,%ymm5,%ymm5 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vmovdqu %ymm9,160(%rsp) - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm2,%ymm3,%ymm6 - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpsrldq $4,%ymm5,%ymm8 - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm8,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - vpsrld $31,%ymm6,%ymm8 - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - xorl %ebp,%esi - addl %r12d,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - xorl %ebx,%esi - addl -96(%r13),%ecx - vpxor 
%ymm10,%ymm6,%ymm6 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vmovdqu %ymm9,192(%rsp) - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpalignr $8,%ymm3,%ymm4,%ymm7 - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpsrldq $4,%ymm6,%ymm8 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm8,%ymm7,%ymm7 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - vpsrld $31,%ymm7,%ymm8 - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - xorl %ebx,%eax - addl %r12d,%esi - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - xorl %ecx,%eax - addl -60(%r13),%edx - vpxor %ymm10,%ymm7,%ymm7 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vmovdqu %ymm9,224(%rsp) - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%rsp),%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - jbe L$oop_avx2 - -L$done_avx2: - vzeroupper - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp -L$epilogue_avx2: - .byte 0xf3,0xc3 - .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s index 77c24f1cf5d973..276322bec2b7e8 100644 --- a/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/sha/sha256-mb-x86_64.s @@ -9,8 +9,6 @@ _sha256_multi_block: movq _OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut - testl $268435456,%ecx - jnz _avx_shortcut movq %rsp,%rax pushq %rbx pushq %rbp @@ -2679,10 +2677,10 @@ L$oop_grande_shaext: punpckhqdq %xmm8,%xmm14 punpckhqdq %xmm10,%xmm15 - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd 
$0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 jmp L$oop_shaext .p2align 5 @@ -2714,11 +2712,11 @@ L$oop_shaext: movdqa %xmm2,%xmm0 movdqa %xmm15,112(%rsp) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pxor %xmm12,%xmm4 movdqa %xmm12,64(%rsp) .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pxor %xmm14,%xmm8 movdqa %xmm14,96(%rsp) movdqa 16-128(%rbp),%xmm1 @@ -2736,11 +2734,11 @@ L$oop_shaext: .byte 102,68,15,56,0,211 prefetcht0 127(%r9) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,68,15,56,0,219 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 32-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2753,14 +2751,14 @@ L$oop_shaext: movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,15,58,15,222,4 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 48-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2777,13 +2775,13 @@ L$oop_shaext: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 64-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2799,13 +2797,13 @@ L$oop_shaext: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 80-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2821,13 +2819,13 @@ L$oop_shaext: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 96-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2843,13 +2841,13 @@ L$oop_shaext: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 112-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2865,13 +2863,13 @@ L$oop_shaext: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 128-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2887,13 +2885,13 @@ L$oop_shaext: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 144-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2909,13 +2907,13 @@ L$oop_shaext: .byte 
102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 160-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2931,13 +2929,13 @@ L$oop_shaext: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 176-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2953,13 +2951,13 @@ L$oop_shaext: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 192-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2975,13 +2973,13 @@ L$oop_shaext: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 208-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2997,13 +2995,13 @@ L$oop_shaext: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 224-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -3020,13 +3018,13 @@ L$oop_shaext: pxor %xmm6,%xmm6 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 movdqa 240-128(%rbp),%xmm1 paddd %xmm7,%xmm1 movq (%rbx),%xmm7 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 240-128(%rbp),%xmm2 paddd %xmm11,%xmm2 .byte 69,15,56,203,247 @@ -3036,17 +3034,17 @@ L$oop_shaext: cmovgeq %rsp,%r8 cmpl 4(%rbx),%ecx cmovgeq %rsp,%r9 - pshufd $0,%xmm7,%xmm9 + pshufd $0x00,%xmm7,%xmm9 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 - pshufd $85,%xmm7,%xmm10 + pshufd $0x55,%xmm7,%xmm10 movdqa %xmm7,%xmm11 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pcmpgtd %xmm6,%xmm9 pcmpgtd %xmm6,%xmm10 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pcmpgtd %xmm6,%xmm11 movdqa K256_shaext-16(%rip),%xmm3 .byte 69,15,56,203,247 @@ -3068,10 +3066,10 @@ L$oop_shaext: movl 280(%rsp),%edx - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 movdqa %xmm12,%xmm5 movdqa %xmm13,%xmm6 @@ -3107,4648 +3105,6 @@ L$done_shaext: L$epilogue_shaext: .byte 0xf3,0xc3 - -.p2align 5 -sha256_multi_block_avx: -_avx_shortcut: - shrq $32,%rcx - cmpl $2,%edx - jb L$avx - testl $32,%ecx - jnz _avx2_shortcut - jmp L$avx -.p2align 5 -L$avx: - movq %rsp,%rax - pushq %rbx - pushq %rbp - subq $288,%rsp - andq $-256,%rsp - movq %rax,272(%rsp) -L$body_avx: - leaq K256+128(%rip),%rbp - leaq 256(%rsp),%rbx - leaq 128(%rdi),%rdi - 
-L$oop_grande_avx: - movl %edx,280(%rsp) - xorl %edx,%edx - movq 0(%rsi),%r8 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r8 - movq 16(%rsi),%r9 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r9 - movq 32(%rsi),%r10 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r10 - movq 48(%rsi),%r11 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r11 - testl %edx,%edx - jz L$done_avx - - vmovdqu 0-128(%rdi),%xmm8 - leaq 128(%rsp),%rax - vmovdqu 32-128(%rdi),%xmm9 - vmovdqu 64-128(%rdi),%xmm10 - vmovdqu 96-128(%rdi),%xmm11 - vmovdqu 128-128(%rdi),%xmm12 - vmovdqu 160-128(%rdi),%xmm13 - vmovdqu 192-128(%rdi),%xmm14 - vmovdqu 224-128(%rdi),%xmm15 - vmovdqu L$pbswap(%rip),%xmm6 - jmp L$oop_avx - -.p2align 5 -L$oop_avx: - vpxor %xmm9,%xmm10,%xmm4 - vmovd 0(%r8),%xmm5 - vmovd 0(%r9),%xmm0 - vpinsrd $1,0(%r10),%xmm5,%xmm5 - vpinsrd $1,0(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,0-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovd 4(%r8),%xmm5 - vmovd 4(%r9),%xmm0 - vpinsrd $1,4(%r10),%xmm5,%xmm5 - vpinsrd $1,4(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm5,16-128(%rax) - vpaddd %xmm14,%xmm5,%xmm5 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm5,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovd 8(%r8),%xmm5 - vmovd 8(%r9),%xmm0 - vpinsrd $1,8(%r10),%xmm5,%xmm5 - vpinsrd $1,8(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu 
%xmm5,32-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovd 12(%r8),%xmm5 - vmovd 12(%r9),%xmm0 - vpinsrd $1,12(%r10),%xmm5,%xmm5 - vpinsrd $1,12(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm5,48-128(%rax) - vpaddd %xmm12,%xmm5,%xmm5 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm5,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovd 16(%r8),%xmm5 - vmovd 16(%r9),%xmm0 - vpinsrd $1,16(%r10),%xmm5,%xmm5 - vpinsrd $1,16(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,64-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovd 20(%r8),%xmm5 - vmovd 20(%r9),%xmm0 - vpinsrd $1,20(%r10),%xmm5,%xmm5 - vpinsrd $1,20(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm5,80-128(%rax) - vpaddd %xmm10,%xmm5,%xmm5 - - vpsrld $11,%xmm15,%xmm1 - 
vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm5,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovd 24(%r8),%xmm5 - vmovd 24(%r9),%xmm0 - vpinsrd $1,24(%r10),%xmm5,%xmm5 - vpinsrd $1,24(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,96-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovd 28(%r8),%xmm5 - vmovd 28(%r9),%xmm0 - vpinsrd $1,28(%r10),%xmm5,%xmm5 - vpinsrd $1,28(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm5,112-128(%rax) - vpaddd %xmm8,%xmm5,%xmm5 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm5,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovd 32(%r8),%xmm5 - vmovd 32(%r9),%xmm0 - vpinsrd $1,32(%r10),%xmm5,%xmm5 - vpinsrd $1,32(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,128-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd 
-128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovd 36(%r8),%xmm5 - vmovd 36(%r9),%xmm0 - vpinsrd $1,36(%r10),%xmm5,%xmm5 - vpinsrd $1,36(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm5,144-128(%rax) - vpaddd %xmm14,%xmm5,%xmm5 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm5,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovd 40(%r8),%xmm5 - vmovd 40(%r9),%xmm0 - vpinsrd $1,40(%r10),%xmm5,%xmm5 - vpinsrd $1,40(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,160-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovd 44(%r8),%xmm5 - vmovd 44(%r9),%xmm0 - vpinsrd $1,44(%r10),%xmm5,%xmm5 - vpinsrd $1,44(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm5,176-128(%rax) - vpaddd %xmm12,%xmm5,%xmm5 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld 
$25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm5,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovd 48(%r8),%xmm5 - vmovd 48(%r9),%xmm0 - vpinsrd $1,48(%r10),%xmm5,%xmm5 - vpinsrd $1,48(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,192-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovd 52(%r8),%xmm5 - vmovd 52(%r9),%xmm0 - vpinsrd $1,52(%r10),%xmm5,%xmm5 - vpinsrd $1,52(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm5,208-128(%rax) - vpaddd %xmm10,%xmm5,%xmm5 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm5,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovd 56(%r8),%xmm5 - vmovd 56(%r9),%xmm0 - vpinsrd $1,56(%r10),%xmm5,%xmm5 - vpinsrd $1,56(%r11),%xmm0,%xmm0 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,224-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - 
vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovd 60(%r8),%xmm5 - leaq 64(%r8),%r8 - vmovd 60(%r9),%xmm0 - leaq 64(%r9),%r9 - vpinsrd $1,60(%r10),%xmm5,%xmm5 - leaq 64(%r10),%r10 - vpinsrd $1,60(%r11),%xmm0,%xmm0 - leaq 64(%r11),%r11 - vpunpckldq %xmm0,%xmm5,%xmm5 - vpshufb %xmm6,%xmm5,%xmm5 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm5,240-128(%rax) - vpaddd %xmm8,%xmm5,%xmm5 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - prefetcht0 63(%r8) - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - prefetcht0 63(%r9) - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - prefetcht0 63(%r10) - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - prefetcht0 63(%r11) - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm5,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovdqu 0-128(%rax),%xmm5 - movl $3,%ecx - jmp L$oop_16_xx_avx -.p2align 5 -L$oop_16_xx_avx: - vmovdqu 16-128(%rax),%xmm6 - vpaddd 144-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 224-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,0-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd 
%xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovdqu 32-128(%rax),%xmm5 - vpaddd 160-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 240-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm6,16-128(%rax) - vpaddd %xmm14,%xmm6,%xmm6 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm6,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovdqu 48-128(%rax),%xmm6 - vpaddd 176-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 0-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,32-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovdqu 64-128(%rax),%xmm5 - vpaddd 192-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - 
vpslld $14,%xmm5,%xmm2 - vmovdqu 16-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm6,48-128(%rax) - vpaddd %xmm12,%xmm6,%xmm6 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm6,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovdqu 80-128(%rax),%xmm6 - vpaddd 208-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 32-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,64-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovdqu 96-128(%rax),%xmm5 - vpaddd 224-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 48-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd 
%xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm6,80-128(%rax) - vpaddd %xmm10,%xmm6,%xmm6 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm6,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovdqu 112-128(%rax),%xmm6 - vpaddd 240-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 64-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,96-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovdqu 128-128(%rax),%xmm5 - vpaddd 0-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 80-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm6,112-128(%rax) - vpaddd %xmm8,%xmm6,%xmm6 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn 
%xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - vmovdqu 144-128(%rax),%xmm6 - vpaddd 16-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 96-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm12,%xmm7 - vpslld $26,%xmm12,%xmm2 - vmovdqu %xmm5,128-128(%rax) - vpaddd %xmm15,%xmm5,%xmm5 - - vpsrld $11,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm12,%xmm2 - vpaddd -128(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm12,%xmm2 - vpandn %xmm14,%xmm12,%xmm0 - vpand %xmm13,%xmm12,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm8,%xmm15 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm8,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm8,%xmm9,%xmm3 - - vpxor %xmm1,%xmm15,%xmm15 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm8,%xmm1 - - vpslld $19,%xmm8,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm15,%xmm7 - - vpsrld $22,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm8,%xmm2 - vpxor %xmm4,%xmm9,%xmm15 - vpaddd %xmm5,%xmm11,%xmm11 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm15,%xmm15 - vpaddd %xmm7,%xmm15,%xmm15 - vmovdqu 160-128(%rax),%xmm5 - vpaddd 32-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 112-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm11,%xmm7 - vpslld $26,%xmm11,%xmm2 - vmovdqu %xmm6,144-128(%rax) - vpaddd %xmm14,%xmm6,%xmm6 - - vpsrld $11,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm11,%xmm2 - vpaddd -96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm11,%xmm2 - vpandn %xmm13,%xmm11,%xmm0 - vpand %xmm12,%xmm11,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm15,%xmm14 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm15,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm15,%xmm8,%xmm4 - - vpxor %xmm1,%xmm14,%xmm14 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm15,%xmm1 - - vpslld $19,%xmm15,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - 
vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm14,%xmm7 - - vpsrld $22,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm15,%xmm2 - vpxor %xmm3,%xmm8,%xmm14 - vpaddd %xmm6,%xmm10,%xmm10 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm14,%xmm14 - vpaddd %xmm7,%xmm14,%xmm14 - vmovdqu 176-128(%rax),%xmm6 - vpaddd 48-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 128-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm10,%xmm7 - vpslld $26,%xmm10,%xmm2 - vmovdqu %xmm5,160-128(%rax) - vpaddd %xmm13,%xmm5,%xmm5 - - vpsrld $11,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm10,%xmm2 - vpaddd -64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm10,%xmm2 - vpandn %xmm12,%xmm10,%xmm0 - vpand %xmm11,%xmm10,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm14,%xmm13 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm14,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm14,%xmm15,%xmm3 - - vpxor %xmm1,%xmm13,%xmm13 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm14,%xmm1 - - vpslld $19,%xmm14,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm13,%xmm7 - - vpsrld $22,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm14,%xmm2 - vpxor %xmm4,%xmm15,%xmm13 - vpaddd %xmm5,%xmm9,%xmm9 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm13,%xmm13 - vpaddd %xmm7,%xmm13,%xmm13 - vmovdqu 192-128(%rax),%xmm5 - vpaddd 64-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 144-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm9,%xmm7 - vpslld $26,%xmm9,%xmm2 - vmovdqu %xmm6,176-128(%rax) - vpaddd %xmm12,%xmm6,%xmm6 - - vpsrld $11,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm9,%xmm2 - vpaddd -32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm9,%xmm2 - vpandn %xmm11,%xmm9,%xmm0 - vpand %xmm10,%xmm9,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm13,%xmm12 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm13,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm13,%xmm14,%xmm4 - - vpxor %xmm1,%xmm12,%xmm12 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm13,%xmm1 - - vpslld $19,%xmm13,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm12,%xmm7 - - vpsrld $22,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm13,%xmm2 - vpxor %xmm3,%xmm14,%xmm12 - vpaddd %xmm6,%xmm8,%xmm8 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm12,%xmm12 - vpaddd %xmm7,%xmm12,%xmm12 - vmovdqu 208-128(%rax),%xmm6 - vpaddd 
80-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 160-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm8,%xmm7 - vpslld $26,%xmm8,%xmm2 - vmovdqu %xmm5,192-128(%rax) - vpaddd %xmm11,%xmm5,%xmm5 - - vpsrld $11,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm8,%xmm2 - vpaddd 0(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm8,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm8,%xmm2 - vpandn %xmm10,%xmm8,%xmm0 - vpand %xmm9,%xmm8,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm12,%xmm11 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm12,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm12,%xmm13,%xmm3 - - vpxor %xmm1,%xmm11,%xmm11 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm12,%xmm1 - - vpslld $19,%xmm12,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm11,%xmm7 - - vpsrld $22,%xmm12,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm12,%xmm2 - vpxor %xmm4,%xmm13,%xmm11 - vpaddd %xmm5,%xmm15,%xmm15 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm11,%xmm11 - vpaddd %xmm7,%xmm11,%xmm11 - vmovdqu 224-128(%rax),%xmm5 - vpaddd 96-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 176-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm15,%xmm7 - vpslld $26,%xmm15,%xmm2 - vmovdqu %xmm6,208-128(%rax) - vpaddd %xmm10,%xmm6,%xmm6 - - vpsrld $11,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm15,%xmm2 - vpaddd 32(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm15,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm15,%xmm2 - vpandn %xmm9,%xmm15,%xmm0 - vpand %xmm8,%xmm15,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm11,%xmm10 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm11,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm11,%xmm12,%xmm4 - - vpxor %xmm1,%xmm10,%xmm10 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm11,%xmm1 - - vpslld $19,%xmm11,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm10,%xmm7 - - vpsrld $22,%xmm11,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm11,%xmm2 - vpxor %xmm3,%xmm12,%xmm10 - vpaddd %xmm6,%xmm14,%xmm14 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm10,%xmm10 - vpaddd %xmm7,%xmm10,%xmm10 - vmovdqu 240-128(%rax),%xmm6 - vpaddd 112-128(%rax),%xmm5,%xmm5 - - vpsrld $3,%xmm6,%xmm7 - vpsrld $7,%xmm6,%xmm1 - vpslld $25,%xmm6,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm6,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm6,%xmm2 - vmovdqu 192-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld 
$15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm5,%xmm5 - vpxor %xmm1,%xmm3,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm5,%xmm5 - vpsrld $6,%xmm14,%xmm7 - vpslld $26,%xmm14,%xmm2 - vmovdqu %xmm5,224-128(%rax) - vpaddd %xmm9,%xmm5,%xmm5 - - vpsrld $11,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm14,%xmm2 - vpaddd 64(%rbp),%xmm5,%xmm5 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm14,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm14,%xmm2 - vpandn %xmm8,%xmm14,%xmm0 - vpand %xmm15,%xmm14,%xmm3 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm10,%xmm9 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm10,%xmm1 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm10,%xmm11,%xmm3 - - vpxor %xmm1,%xmm9,%xmm9 - vpaddd %xmm7,%xmm5,%xmm5 - - vpsrld $13,%xmm10,%xmm1 - - vpslld $19,%xmm10,%xmm2 - vpaddd %xmm0,%xmm5,%xmm5 - vpand %xmm3,%xmm4,%xmm4 - - vpxor %xmm1,%xmm9,%xmm7 - - vpsrld $22,%xmm10,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm10,%xmm2 - vpxor %xmm4,%xmm11,%xmm9 - vpaddd %xmm5,%xmm13,%xmm13 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm5,%xmm9,%xmm9 - vpaddd %xmm7,%xmm9,%xmm9 - vmovdqu 0-128(%rax),%xmm5 - vpaddd 128-128(%rax),%xmm6,%xmm6 - - vpsrld $3,%xmm5,%xmm7 - vpsrld $7,%xmm5,%xmm1 - vpslld $25,%xmm5,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $18,%xmm5,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $14,%xmm5,%xmm2 - vmovdqu 208-128(%rax),%xmm0 - vpsrld $10,%xmm0,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - vpsrld $17,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $15,%xmm0,%xmm2 - vpaddd %xmm7,%xmm6,%xmm6 - vpxor %xmm1,%xmm4,%xmm7 - vpsrld $19,%xmm0,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $13,%xmm0,%xmm2 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - vpaddd %xmm7,%xmm6,%xmm6 - vpsrld $6,%xmm13,%xmm7 - vpslld $26,%xmm13,%xmm2 - vmovdqu %xmm6,240-128(%rax) - vpaddd %xmm8,%xmm6,%xmm6 - - vpsrld $11,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpslld $21,%xmm13,%xmm2 - vpaddd 96(%rbp),%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $25,%xmm13,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $7,%xmm13,%xmm2 - vpandn %xmm15,%xmm13,%xmm0 - vpand %xmm14,%xmm13,%xmm4 - - vpxor %xmm1,%xmm7,%xmm7 - - vpsrld $2,%xmm9,%xmm8 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $30,%xmm9,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm9,%xmm10,%xmm4 - - vpxor %xmm1,%xmm8,%xmm8 - vpaddd %xmm7,%xmm6,%xmm6 - - vpsrld $13,%xmm9,%xmm1 - - vpslld $19,%xmm9,%xmm2 - vpaddd %xmm0,%xmm6,%xmm6 - vpand %xmm4,%xmm3,%xmm3 - - vpxor %xmm1,%xmm8,%xmm7 - - vpsrld $22,%xmm9,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - - vpslld $10,%xmm9,%xmm2 - vpxor %xmm3,%xmm10,%xmm8 - vpaddd %xmm6,%xmm12,%xmm12 - - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm2,%xmm7,%xmm7 - - vpaddd %xmm6,%xmm8,%xmm8 - vpaddd %xmm7,%xmm8,%xmm8 - addq $256,%rbp - decl %ecx - jnz L$oop_16_xx_avx - - movl $1,%ecx - leaq K256+128(%rip),%rbp - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqa (%rbx),%xmm7 - vpxor %xmm0,%xmm0,%xmm0 - vmovdqa %xmm7,%xmm6 - vpcmpgtd %xmm0,%xmm6,%xmm6 - vpaddd %xmm6,%xmm7,%xmm7 - - vmovdqu 0-128(%rdi),%xmm0 - vpand %xmm6,%xmm8,%xmm8 - vmovdqu 32-128(%rdi),%xmm1 - vpand %xmm6,%xmm9,%xmm9 - vmovdqu 64-128(%rdi),%xmm2 - vpand %xmm6,%xmm10,%xmm10 - vmovdqu 96-128(%rdi),%xmm5 - vpand %xmm6,%xmm11,%xmm11 - vpaddd %xmm0,%xmm8,%xmm8 - vmovdqu 128-128(%rdi),%xmm0 - vpand %xmm6,%xmm12,%xmm12 - vpaddd %xmm1,%xmm9,%xmm9 - vmovdqu 160-128(%rdi),%xmm1 - vpand 
%xmm6,%xmm13,%xmm13 - vpaddd %xmm2,%xmm10,%xmm10 - vmovdqu 192-128(%rdi),%xmm2 - vpand %xmm6,%xmm14,%xmm14 - vpaddd %xmm5,%xmm11,%xmm11 - vmovdqu 224-128(%rdi),%xmm5 - vpand %xmm6,%xmm15,%xmm15 - vpaddd %xmm0,%xmm12,%xmm12 - vpaddd %xmm1,%xmm13,%xmm13 - vmovdqu %xmm8,0-128(%rdi) - vpaddd %xmm2,%xmm14,%xmm14 - vmovdqu %xmm9,32-128(%rdi) - vpaddd %xmm5,%xmm15,%xmm15 - vmovdqu %xmm10,64-128(%rdi) - vmovdqu %xmm11,96-128(%rdi) - vmovdqu %xmm12,128-128(%rdi) - vmovdqu %xmm13,160-128(%rdi) - vmovdqu %xmm14,192-128(%rdi) - vmovdqu %xmm15,224-128(%rdi) - - vmovdqu %xmm7,(%rbx) - vmovdqu L$pbswap(%rip),%xmm6 - decl %edx - jnz L$oop_avx - - movl 280(%rsp),%edx - leaq 16(%rdi),%rdi - leaq 64(%rsi),%rsi - decl %edx - jnz L$oop_grande_avx - -L$done_avx: - movq 272(%rsp),%rax - vzeroupper - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$epilogue_avx: - .byte 0xf3,0xc3 - - -.p2align 5 -sha256_multi_block_avx2: -_avx2_shortcut: - movq %rsp,%rax - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $576,%rsp - andq $-256,%rsp - movq %rax,544(%rsp) -L$body_avx2: - leaq K256+128(%rip),%rbp - leaq 128(%rdi),%rdi - -L$oop_grande_avx2: - movl %edx,552(%rsp) - xorl %edx,%edx - leaq 512(%rsp),%rbx - movq 0(%rsi),%r12 - movl 8(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,0(%rbx) - cmovleq %rbp,%r12 - movq 16(%rsi),%r13 - movl 24(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,4(%rbx) - cmovleq %rbp,%r13 - movq 32(%rsi),%r14 - movl 40(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,8(%rbx) - cmovleq %rbp,%r14 - movq 48(%rsi),%r15 - movl 56(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,12(%rbx) - cmovleq %rbp,%r15 - movq 64(%rsi),%r8 - movl 72(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,16(%rbx) - cmovleq %rbp,%r8 - movq 80(%rsi),%r9 - movl 88(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,20(%rbx) - cmovleq %rbp,%r9 - movq 96(%rsi),%r10 - movl 104(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,24(%rbx) - cmovleq %rbp,%r10 - movq 112(%rsi),%r11 - movl 120(%rsi),%ecx - cmpl %edx,%ecx - cmovgl %ecx,%edx - testl %ecx,%ecx - movl %ecx,28(%rbx) - cmovleq %rbp,%r11 - vmovdqu 0-128(%rdi),%ymm8 - leaq 128(%rsp),%rax - vmovdqu 32-128(%rdi),%ymm9 - leaq 256+128(%rsp),%rbx - vmovdqu 64-128(%rdi),%ymm10 - vmovdqu 96-128(%rdi),%ymm11 - vmovdqu 128-128(%rdi),%ymm12 - vmovdqu 160-128(%rdi),%ymm13 - vmovdqu 192-128(%rdi),%ymm14 - vmovdqu 224-128(%rdi),%ymm15 - vmovdqu L$pbswap(%rip),%ymm6 - jmp L$oop_avx2 - -.p2align 5 -L$oop_avx2: - vpxor %ymm9,%ymm10,%ymm4 - vmovd 0(%r12),%xmm5 - vmovd 0(%r8),%xmm0 - vmovd 0(%r13),%xmm1 - vmovd 0(%r9),%xmm2 - vpinsrd $1,0(%r14),%xmm5,%xmm5 - vpinsrd $1,0(%r10),%xmm0,%xmm0 - vpinsrd $1,0(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,0(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,0-128(%rax) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 
- vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovd 4(%r12),%xmm5 - vmovd 4(%r8),%xmm0 - vmovd 4(%r13),%xmm1 - vmovd 4(%r9),%xmm2 - vpinsrd $1,4(%r14),%xmm5,%xmm5 - vpinsrd $1,4(%r10),%xmm0,%xmm0 - vpinsrd $1,4(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,4(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm5,32-128(%rax) - vpaddd %ymm14,%ymm5,%ymm5 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm5,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovd 8(%r12),%xmm5 - vmovd 8(%r8),%xmm0 - vmovd 8(%r13),%xmm1 - vmovd 8(%r9),%xmm2 - vpinsrd $1,8(%r14),%xmm5,%xmm5 - vpinsrd $1,8(%r10),%xmm0,%xmm0 - vpinsrd $1,8(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,8(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,64-128(%rax) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovd 12(%r12),%xmm5 - vmovd 12(%r8),%xmm0 - vmovd 12(%r13),%xmm1 - vmovd 12(%r9),%xmm2 - vpinsrd $1,12(%r14),%xmm5,%xmm5 - vpinsrd $1,12(%r10),%xmm0,%xmm0 - vpinsrd $1,12(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,12(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm9,%ymm7 - vpslld 
$26,%ymm9,%ymm2 - vmovdqu %ymm5,96-128(%rax) - vpaddd %ymm12,%ymm5,%ymm5 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm5,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovd 16(%r12),%xmm5 - vmovd 16(%r8),%xmm0 - vmovd 16(%r13),%xmm1 - vmovd 16(%r9),%xmm2 - vpinsrd $1,16(%r14),%xmm5,%xmm5 - vpinsrd $1,16(%r10),%xmm0,%xmm0 - vpinsrd $1,16(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,16(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,128-128(%rax) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovd 20(%r12),%xmm5 - vmovd 20(%r8),%xmm0 - vmovd 20(%r13),%xmm1 - vmovd 20(%r9),%xmm2 - vpinsrd $1,20(%r14),%xmm5,%xmm5 - vpinsrd $1,20(%r10),%xmm0,%xmm0 - vpinsrd $1,20(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,20(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm5,160-128(%rax) - vpaddd %ymm10,%ymm5,%ymm5 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm5,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor 
%ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovd 24(%r12),%xmm5 - vmovd 24(%r8),%xmm0 - vmovd 24(%r13),%xmm1 - vmovd 24(%r9),%xmm2 - vpinsrd $1,24(%r14),%xmm5,%xmm5 - vpinsrd $1,24(%r10),%xmm0,%xmm0 - vpinsrd $1,24(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,24(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,192-128(%rax) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovd 28(%r12),%xmm5 - vmovd 28(%r8),%xmm0 - vmovd 28(%r13),%xmm1 - vmovd 28(%r9),%xmm2 - vpinsrd $1,28(%r14),%xmm5,%xmm5 - vpinsrd $1,28(%r10),%xmm0,%xmm0 - vpinsrd $1,28(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,28(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm5,224-128(%rax) - vpaddd %ymm8,%ymm5,%ymm5 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm5,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovd 32(%r12),%xmm5 - vmovd 32(%r8),%xmm0 - vmovd 32(%r13),%xmm1 - vmovd 32(%r9),%xmm2 - vpinsrd $1,32(%r14),%xmm5,%xmm5 - vpinsrd $1,32(%r10),%xmm0,%xmm0 - vpinsrd $1,32(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,32(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,256-256-128(%rbx) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 
- vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovd 36(%r12),%xmm5 - vmovd 36(%r8),%xmm0 - vmovd 36(%r13),%xmm1 - vmovd 36(%r9),%xmm2 - vpinsrd $1,36(%r14),%xmm5,%xmm5 - vpinsrd $1,36(%r10),%xmm0,%xmm0 - vpinsrd $1,36(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,36(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm5,288-256-128(%rbx) - vpaddd %ymm14,%ymm5,%ymm5 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm5,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovd 40(%r12),%xmm5 - vmovd 40(%r8),%xmm0 - vmovd 40(%r13),%xmm1 - vmovd 40(%r9),%xmm2 - vpinsrd $1,40(%r14),%xmm5,%xmm5 - vpinsrd $1,40(%r10),%xmm0,%xmm0 - vpinsrd $1,40(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,40(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,320-256-128(%rbx) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovd 44(%r12),%xmm5 - vmovd 44(%r8),%xmm0 - vmovd 44(%r13),%xmm1 - vmovd 44(%r9),%xmm2 - vpinsrd $1,44(%r14),%xmm5,%xmm5 - vpinsrd $1,44(%r10),%xmm0,%xmm0 - vpinsrd $1,44(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,44(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 
$1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm5,352-256-128(%rbx) - vpaddd %ymm12,%ymm5,%ymm5 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm5,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovd 48(%r12),%xmm5 - vmovd 48(%r8),%xmm0 - vmovd 48(%r13),%xmm1 - vmovd 48(%r9),%xmm2 - vpinsrd $1,48(%r14),%xmm5,%xmm5 - vpinsrd $1,48(%r10),%xmm0,%xmm0 - vpinsrd $1,48(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,48(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,384-256-128(%rbx) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovd 52(%r12),%xmm5 - vmovd 52(%r8),%xmm0 - vmovd 52(%r13),%xmm1 - vmovd 52(%r9),%xmm2 - vpinsrd $1,52(%r14),%xmm5,%xmm5 - vpinsrd $1,52(%r10),%xmm0,%xmm0 - vpinsrd $1,52(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,52(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm5,416-256-128(%rbx) - vpaddd %ymm10,%ymm5,%ymm5 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld 
$10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm5,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovd 56(%r12),%xmm5 - vmovd 56(%r8),%xmm0 - vmovd 56(%r13),%xmm1 - vmovd 56(%r9),%xmm2 - vpinsrd $1,56(%r14),%xmm5,%xmm5 - vpinsrd $1,56(%r10),%xmm0,%xmm0 - vpinsrd $1,56(%r15),%xmm1,%xmm1 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,56(%r11),%xmm2,%xmm2 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,448-256-128(%rbx) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovd 60(%r12),%xmm5 - leaq 64(%r12),%r12 - vmovd 60(%r8),%xmm0 - leaq 64(%r8),%r8 - vmovd 60(%r13),%xmm1 - leaq 64(%r13),%r13 - vmovd 60(%r9),%xmm2 - leaq 64(%r9),%r9 - vpinsrd $1,60(%r14),%xmm5,%xmm5 - leaq 64(%r14),%r14 - vpinsrd $1,60(%r10),%xmm0,%xmm0 - leaq 64(%r10),%r10 - vpinsrd $1,60(%r15),%xmm1,%xmm1 - leaq 64(%r15),%r15 - vpunpckldq %ymm1,%ymm5,%ymm5 - vpinsrd $1,60(%r11),%xmm2,%xmm2 - leaq 64(%r11),%r11 - vpunpckldq %ymm2,%ymm0,%ymm0 - vinserti128 $1,%xmm0,%ymm5,%ymm5 - vpshufb %ymm6,%ymm5,%ymm5 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm5,480-256-128(%rbx) - vpaddd %ymm8,%ymm5,%ymm5 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r12) - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - prefetcht0 63(%r13) - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r14) - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - prefetcht0 63(%r15) - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm9,%ymm1 - prefetcht0 63(%r8) - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm4,%ymm3,%ymm3 - prefetcht0 63(%r9) - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - prefetcht0 63(%r10) - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm5,%ymm12,%ymm12 - prefetcht0 63(%r11) - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovdqu 0-128(%rax),%ymm5 - movl $3,%ecx - jmp L$oop_16_xx_avx2 -.p2align 5 -L$oop_16_xx_avx2: - vmovdqu 32-128(%rax),%ymm6 - vpaddd 288-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 
448-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,0-128(%rax) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovdqu 64-128(%rax),%ymm5 - vpaddd 320-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 480-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm6,32-128(%rax) - vpaddd %ymm14,%ymm6,%ymm6 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm6,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovdqu 96-128(%rax),%ymm6 - vpaddd 352-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 0-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd 
%ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,64-128(%rax) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovdqu 128-128(%rax),%ymm5 - vpaddd 384-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 32-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm6,96-128(%rax) - vpaddd %ymm12,%ymm6,%ymm6 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm6,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovdqu 160-128(%rax),%ymm6 - vpaddd 416-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 64-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,128-128(%rax) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn 
%ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovdqu 192-128(%rax),%ymm5 - vpaddd 448-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 96-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - vmovdqu %ymm6,160-128(%rax) - vpaddd %ymm10,%ymm6,%ymm6 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm6,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovdqu 224-128(%rax),%ymm6 - vpaddd 480-256-128(%rbx),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 128-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,192-128(%rax) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd 
%ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovdqu 256-256-128(%rbx),%ymm5 - vpaddd 0-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 160-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm6,224-128(%rax) - vpaddd %ymm8,%ymm6,%ymm6 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm6,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - vmovdqu 288-256-128(%rbx),%ymm6 - vpaddd 32-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 192-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm12,%ymm7 - vpslld $26,%ymm12,%ymm2 - vmovdqu %ymm5,256-256-128(%rbx) - vpaddd %ymm15,%ymm5,%ymm5 - - vpsrld $11,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm12,%ymm2 - vpaddd -128(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm12,%ymm2 - vpandn %ymm14,%ymm12,%ymm0 - vpand %ymm13,%ymm12,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm8,%ymm15 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm8,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm8,%ymm9,%ymm3 - - vpxor %ymm1,%ymm15,%ymm15 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm8,%ymm1 - - vpslld $19,%ymm8,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm15,%ymm7 - - vpsrld $22,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm8,%ymm2 - vpxor %ymm4,%ymm9,%ymm15 - vpaddd %ymm5,%ymm11,%ymm11 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm15,%ymm15 - vpaddd %ymm7,%ymm15,%ymm15 - vmovdqu 
320-256-128(%rbx),%ymm5 - vpaddd 64-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 224-128(%rax),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm11,%ymm7 - vpslld $26,%ymm11,%ymm2 - vmovdqu %ymm6,288-256-128(%rbx) - vpaddd %ymm14,%ymm6,%ymm6 - - vpsrld $11,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm11,%ymm2 - vpaddd -96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm11,%ymm2 - vpandn %ymm13,%ymm11,%ymm0 - vpand %ymm12,%ymm11,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm15,%ymm14 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm15,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm15,%ymm8,%ymm4 - - vpxor %ymm1,%ymm14,%ymm14 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm15,%ymm1 - - vpslld $19,%ymm15,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm14,%ymm7 - - vpsrld $22,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm15,%ymm2 - vpxor %ymm3,%ymm8,%ymm14 - vpaddd %ymm6,%ymm10,%ymm10 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm14,%ymm14 - vpaddd %ymm7,%ymm14,%ymm14 - vmovdqu 352-256-128(%rbx),%ymm6 - vpaddd 96-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 256-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm10,%ymm7 - vpslld $26,%ymm10,%ymm2 - vmovdqu %ymm5,320-256-128(%rbx) - vpaddd %ymm13,%ymm5,%ymm5 - - vpsrld $11,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm10,%ymm2 - vpaddd -64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm10,%ymm2 - vpandn %ymm12,%ymm10,%ymm0 - vpand %ymm11,%ymm10,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm14,%ymm13 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm14,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm14,%ymm15,%ymm3 - - vpxor %ymm1,%ymm13,%ymm13 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm14,%ymm1 - - vpslld $19,%ymm14,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm13,%ymm7 - - vpsrld $22,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm14,%ymm2 - vpxor %ymm4,%ymm15,%ymm13 - vpaddd %ymm5,%ymm9,%ymm9 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm13,%ymm13 - vpaddd %ymm7,%ymm13,%ymm13 - vmovdqu 384-256-128(%rbx),%ymm5 - vpaddd 128-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 288-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - 
vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm9,%ymm7 - vpslld $26,%ymm9,%ymm2 - vmovdqu %ymm6,352-256-128(%rbx) - vpaddd %ymm12,%ymm6,%ymm6 - - vpsrld $11,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm9,%ymm2 - vpaddd -32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm9,%ymm2 - vpandn %ymm11,%ymm9,%ymm0 - vpand %ymm10,%ymm9,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm13,%ymm12 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm13,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm13,%ymm14,%ymm4 - - vpxor %ymm1,%ymm12,%ymm12 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm13,%ymm1 - - vpslld $19,%ymm13,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm12,%ymm7 - - vpsrld $22,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm13,%ymm2 - vpxor %ymm3,%ymm14,%ymm12 - vpaddd %ymm6,%ymm8,%ymm8 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm12,%ymm12 - vpaddd %ymm7,%ymm12,%ymm12 - vmovdqu 416-256-128(%rbx),%ymm6 - vpaddd 160-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 320-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm8,%ymm7 - vpslld $26,%ymm8,%ymm2 - vmovdqu %ymm5,384-256-128(%rbx) - vpaddd %ymm11,%ymm5,%ymm5 - - vpsrld $11,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm8,%ymm2 - vpaddd 0(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm8,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm8,%ymm2 - vpandn %ymm10,%ymm8,%ymm0 - vpand %ymm9,%ymm8,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm12,%ymm11 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm12,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm12,%ymm13,%ymm3 - - vpxor %ymm1,%ymm11,%ymm11 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm12,%ymm1 - - vpslld $19,%ymm12,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm11,%ymm7 - - vpsrld $22,%ymm12,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm12,%ymm2 - vpxor %ymm4,%ymm13,%ymm11 - vpaddd %ymm5,%ymm15,%ymm15 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm11,%ymm11 - vpaddd %ymm7,%ymm11,%ymm11 - vmovdqu 448-256-128(%rbx),%ymm5 - vpaddd 192-128(%rax),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 352-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm15,%ymm7 - vpslld $26,%ymm15,%ymm2 - 
vmovdqu %ymm6,416-256-128(%rbx) - vpaddd %ymm10,%ymm6,%ymm6 - - vpsrld $11,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm15,%ymm2 - vpaddd 32(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm15,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm15,%ymm2 - vpandn %ymm9,%ymm15,%ymm0 - vpand %ymm8,%ymm15,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm11,%ymm10 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm11,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm11,%ymm12,%ymm4 - - vpxor %ymm1,%ymm10,%ymm10 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm11,%ymm1 - - vpslld $19,%ymm11,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm10,%ymm7 - - vpsrld $22,%ymm11,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm11,%ymm2 - vpxor %ymm3,%ymm12,%ymm10 - vpaddd %ymm6,%ymm14,%ymm14 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm10,%ymm10 - vpaddd %ymm7,%ymm10,%ymm10 - vmovdqu 480-256-128(%rbx),%ymm6 - vpaddd 224-128(%rax),%ymm5,%ymm5 - - vpsrld $3,%ymm6,%ymm7 - vpsrld $7,%ymm6,%ymm1 - vpslld $25,%ymm6,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm6,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm6,%ymm2 - vmovdqu 384-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm5,%ymm5 - vpxor %ymm1,%ymm3,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm5,%ymm5 - vpsrld $6,%ymm14,%ymm7 - vpslld $26,%ymm14,%ymm2 - vmovdqu %ymm5,448-256-128(%rbx) - vpaddd %ymm9,%ymm5,%ymm5 - - vpsrld $11,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm14,%ymm2 - vpaddd 64(%rbp),%ymm5,%ymm5 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm14,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm14,%ymm2 - vpandn %ymm8,%ymm14,%ymm0 - vpand %ymm15,%ymm14,%ymm3 - - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm10,%ymm9 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm10,%ymm1 - vpxor %ymm3,%ymm0,%ymm0 - vpxor %ymm10,%ymm11,%ymm3 - - vpxor %ymm1,%ymm9,%ymm9 - vpaddd %ymm7,%ymm5,%ymm5 - - vpsrld $13,%ymm10,%ymm1 - - vpslld $19,%ymm10,%ymm2 - vpaddd %ymm0,%ymm5,%ymm5 - vpand %ymm3,%ymm4,%ymm4 - - vpxor %ymm1,%ymm9,%ymm7 - - vpsrld $22,%ymm10,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm10,%ymm2 - vpxor %ymm4,%ymm11,%ymm9 - vpaddd %ymm5,%ymm13,%ymm13 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm5,%ymm9,%ymm9 - vpaddd %ymm7,%ymm9,%ymm9 - vmovdqu 0-128(%rax),%ymm5 - vpaddd 256-256-128(%rbx),%ymm6,%ymm6 - - vpsrld $3,%ymm5,%ymm7 - vpsrld $7,%ymm5,%ymm1 - vpslld $25,%ymm5,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $18,%ymm5,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $14,%ymm5,%ymm2 - vmovdqu 416-256-128(%rbx),%ymm0 - vpsrld $10,%ymm0,%ymm4 - - vpxor %ymm1,%ymm7,%ymm7 - vpsrld $17,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $15,%ymm0,%ymm2 - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm1,%ymm4,%ymm7 - vpsrld $19,%ymm0,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $13,%ymm0,%ymm2 - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - vpaddd %ymm7,%ymm6,%ymm6 - vpsrld $6,%ymm13,%ymm7 - vpslld $26,%ymm13,%ymm2 - vmovdqu %ymm6,480-256-128(%rbx) - vpaddd %ymm8,%ymm6,%ymm6 - - vpsrld $11,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - vpslld $21,%ymm13,%ymm2 - vpaddd 96(%rbp),%ymm6,%ymm6 - vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $25,%ymm13,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $7,%ymm13,%ymm2 - vpandn %ymm15,%ymm13,%ymm0 - vpand %ymm14,%ymm13,%ymm4 - - 
vpxor %ymm1,%ymm7,%ymm7 - - vpsrld $2,%ymm9,%ymm8 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $30,%ymm9,%ymm1 - vpxor %ymm4,%ymm0,%ymm0 - vpxor %ymm9,%ymm10,%ymm4 - - vpxor %ymm1,%ymm8,%ymm8 - vpaddd %ymm7,%ymm6,%ymm6 - - vpsrld $13,%ymm9,%ymm1 - - vpslld $19,%ymm9,%ymm2 - vpaddd %ymm0,%ymm6,%ymm6 - vpand %ymm4,%ymm3,%ymm3 - - vpxor %ymm1,%ymm8,%ymm7 - - vpsrld $22,%ymm9,%ymm1 - vpxor %ymm2,%ymm7,%ymm7 - - vpslld $10,%ymm9,%ymm2 - vpxor %ymm3,%ymm10,%ymm8 - vpaddd %ymm6,%ymm12,%ymm12 - - vpxor %ymm1,%ymm7,%ymm7 - vpxor %ymm2,%ymm7,%ymm7 - - vpaddd %ymm6,%ymm8,%ymm8 - vpaddd %ymm7,%ymm8,%ymm8 - addq $256,%rbp - decl %ecx - jnz L$oop_16_xx_avx2 - - movl $1,%ecx - leaq 512(%rsp),%rbx - leaq K256+128(%rip),%rbp - cmpl 0(%rbx),%ecx - cmovgeq %rbp,%r12 - cmpl 4(%rbx),%ecx - cmovgeq %rbp,%r13 - cmpl 8(%rbx),%ecx - cmovgeq %rbp,%r14 - cmpl 12(%rbx),%ecx - cmovgeq %rbp,%r15 - cmpl 16(%rbx),%ecx - cmovgeq %rbp,%r8 - cmpl 20(%rbx),%ecx - cmovgeq %rbp,%r9 - cmpl 24(%rbx),%ecx - cmovgeq %rbp,%r10 - cmpl 28(%rbx),%ecx - cmovgeq %rbp,%r11 - vmovdqa (%rbx),%ymm7 - vpxor %ymm0,%ymm0,%ymm0 - vmovdqa %ymm7,%ymm6 - vpcmpgtd %ymm0,%ymm6,%ymm6 - vpaddd %ymm6,%ymm7,%ymm7 - - vmovdqu 0-128(%rdi),%ymm0 - vpand %ymm6,%ymm8,%ymm8 - vmovdqu 32-128(%rdi),%ymm1 - vpand %ymm6,%ymm9,%ymm9 - vmovdqu 64-128(%rdi),%ymm2 - vpand %ymm6,%ymm10,%ymm10 - vmovdqu 96-128(%rdi),%ymm5 - vpand %ymm6,%ymm11,%ymm11 - vpaddd %ymm0,%ymm8,%ymm8 - vmovdqu 128-128(%rdi),%ymm0 - vpand %ymm6,%ymm12,%ymm12 - vpaddd %ymm1,%ymm9,%ymm9 - vmovdqu 160-128(%rdi),%ymm1 - vpand %ymm6,%ymm13,%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vmovdqu 192-128(%rdi),%ymm2 - vpand %ymm6,%ymm14,%ymm14 - vpaddd %ymm5,%ymm11,%ymm11 - vmovdqu 224-128(%rdi),%ymm5 - vpand %ymm6,%ymm15,%ymm15 - vpaddd %ymm0,%ymm12,%ymm12 - vpaddd %ymm1,%ymm13,%ymm13 - vmovdqu %ymm8,0-128(%rdi) - vpaddd %ymm2,%ymm14,%ymm14 - vmovdqu %ymm9,32-128(%rdi) - vpaddd %ymm5,%ymm15,%ymm15 - vmovdqu %ymm10,64-128(%rdi) - vmovdqu %ymm11,96-128(%rdi) - vmovdqu %ymm12,128-128(%rdi) - vmovdqu %ymm13,160-128(%rdi) - vmovdqu %ymm14,192-128(%rdi) - vmovdqu %ymm15,224-128(%rdi) - - vmovdqu %ymm7,(%rbx) - leaq 256+128(%rsp),%rbx - vmovdqu L$pbswap(%rip),%ymm6 - decl %edx - jnz L$oop_avx2 - - - - - - - -L$done_avx2: - movq 544(%rsp),%rax - vzeroupper - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$epilogue_avx2: - .byte 0xf3,0xc3 - .p2align 8 K256: .long 1116352408,1116352408,1116352408,1116352408 diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s index b66bd34406cc74..5566d587610ea7 100644 --- a/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/sha/sha256-x86_64.s @@ -11,14 +11,6 @@ _sha256_block_data_order: movl 8(%r11),%r11d testl $536870912,%r11d jnz _shaext_shortcut - andl $296,%r11d - cmpl $296,%r11d - je L$avx2_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je L$avx_shortcut testl $512,%r10d jnz L$ssse3_shortcut pushq %rbx @@ -1762,9 +1754,9 @@ _shaext_shortcut: movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 @@ -1783,7 +1775,7 @@ L$oop_shaext: .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 
+ pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 @@ -1792,7 +1784,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 @@ -1801,7 +1793,7 @@ L$oop_shaext: paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1813,7 +1805,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1824,7 +1816,7 @@ L$oop_shaext: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1835,7 +1827,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1846,7 +1838,7 @@ L$oop_shaext: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1857,7 +1849,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1868,7 +1860,7 @@ L$oop_shaext: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1879,7 +1871,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1890,7 +1882,7 @@ L$oop_shaext: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1901,7 +1893,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1912,7 +1904,7 @@ L$oop_shaext: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1923,7 +1915,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 @@ -1932,7 +1924,7 @@ L$oop_shaext: movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 @@ -1941,7 +1933,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 nop .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 @@ -1950,9 +1942,9 @@ L$oop_shaext: paddd %xmm9,%xmm1 jnz L$oop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 @@ -3054,2304 +3046,3 @@ L$ssse3_00_47: leaq 48(%rsi),%rsp L$epilogue_ssse3: .byte 0xf3,0xc3 - - -.p2align 6 -sha256_block_data_order_avx: -L$avx_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - shlq $4,%rdx - subq 
$96,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) -L$prologue_avx: - - vzeroupper - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%xmm8 - vmovdqa K256+512+64(%rip),%xmm9 - jmp L$loop_avx -.p2align 4 -L$loop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%edi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%edi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp L$avx_00_47 - -.p2align 4 -L$avx_00_47: - subq $-128,%rbp - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm3,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm0,%xmm0 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpshufd $80,%xmm0,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 
0(%rbp),%xmm0,%xmm6 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm0,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm1,%xmm1 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpshufd $80,%xmm1,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - 
vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm1,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm2,%xmm2 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpshufd $80,%xmm2,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm1,%xmm2,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm2,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - 
movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm3,%xmm3 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpshufd $80,%xmm3,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne L$avx_00_47 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi 
- xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl 
%r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb L$loop_avx - - movq 64+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_avx: - .byte 0xf3,0xc3 - - -.p2align 6 -sha256_block_data_order_avx2: -L$avx2_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 
- pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $544,%rsp - shlq $4,%rdx - andq $-1024,%rsp - leaq (%rsi,%rdx,4),%rdx - addq $448,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) -L$prologue_avx2: - - vzeroupper - subq $-64,%rsi - movl 0(%rdi),%eax - movq %rsi,%r12 - movl 4(%rdi),%ebx - cmpq %rdx,%rsi - movl 8(%rdi),%ecx - cmoveq %rsp,%r12 - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%ymm8 - vmovdqa K256+512+64(%rip),%ymm9 - jmp L$oop_avx2 -.p2align 4 -L$oop_avx2: - vmovdqa K256+512(%rip),%ymm7 - vmovdqu -64+0(%rsi),%xmm0 - vmovdqu -64+16(%rsi),%xmm1 - vmovdqu -64+32(%rsi),%xmm2 - vmovdqu -64+48(%rsi),%xmm3 - - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm7,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm7,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - - leaq K256(%rip),%rbp - vpshufb %ymm7,%ymm2,%ymm2 - vpaddd 0(%rbp),%ymm0,%ymm4 - vpshufb %ymm7,%ymm3,%ymm3 - vpaddd 32(%rbp),%ymm1,%ymm5 - vpaddd 64(%rbp),%ymm2,%ymm6 - vpaddd 96(%rbp),%ymm3,%ymm7 - vmovdqa %ymm4,0(%rsp) - xorl %r14d,%r14d - vmovdqa %ymm5,32(%rsp) - leaq -64(%rsp),%rsp - movl %ebx,%edi - vmovdqa %ymm6,0(%rsp) - xorl %ecx,%edi - vmovdqa %ymm7,32(%rsp) - movl %r9d,%r12d - subq $-32*4,%rbp - jmp L$avx2_00_47 - -.p2align 4 -L$avx2_00_47: - leaq -64(%rsp),%rsp - vpalignr $4,%ymm0,%ymm1,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm2,%ymm3,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm0,%ymm0 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - vpshufd $250,%ymm3,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm0,%ymm0 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpshufd $80,%ymm0,%ymm7 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - 
xorl %r11d,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpxor %ymm7,%ymm6,%ymm6 - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - vpaddd %ymm6,%ymm0,%ymm0 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - vpaddd 0(%rbp),%ymm0,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm1,%ymm2,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm3,%ymm0,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm1,%ymm1 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - vpshufd $250,%ymm0,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm1,%ymm1 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpshufd $80,%ymm1,%ymm7 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpxor %ymm7,%ymm6,%ymm6 - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - vpaddd %ymm6,%ymm1,%ymm1 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - vpaddd 32(%rbp),%ymm1,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - 
leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq -64(%rsp),%rsp - vpalignr $4,%ymm2,%ymm3,%ymm4 - addl 0+128(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - vpalignr $4,%ymm0,%ymm1,%ymm7 - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - vpsrld $7,%ymm4,%ymm6 - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - vpaddd %ymm7,%ymm2,%ymm2 - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - vpshufd $250,%ymm1,%ymm7 - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 4+128(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - vpslld $11,%ymm5,%ymm5 - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - vpaddd %ymm4,%ymm2,%ymm2 - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 8+128(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - vpxor %ymm7,%ymm6,%ymm6 - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - vpshufd $80,%ymm2,%ymm7 - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 12+128(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - vpxor %ymm7,%ymm6,%ymm6 - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - vpaddd %ymm6,%ymm2,%ymm2 - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - vpaddd 64(%rbp),%ymm2,%ymm6 - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - vmovdqa %ymm6,0(%rsp) - vpalignr $4,%ymm3,%ymm0,%ymm4 - addl 32+128(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - vpalignr $4,%ymm1,%ymm2,%ymm7 - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - vpsrld $7,%ymm4,%ymm6 - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - vpaddd %ymm7,%ymm3,%ymm3 - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - vpsrld $3,%ymm4,%ymm7 - rorxl $22,%r8d,%r12d - leal 
(%rdx,%r13,1),%edx - xorl %r9d,%r15d - vpslld $14,%ymm4,%ymm5 - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - vpxor %ymm6,%ymm7,%ymm4 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - vpshufd $250,%ymm2,%ymm7 - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - vpsrld $11,%ymm6,%ymm6 - addl 36+128(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - vpxor %ymm5,%ymm4,%ymm4 - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - vpslld $11,%ymm5,%ymm5 - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - vpxor %ymm6,%ymm4,%ymm4 - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - vpsrld $10,%ymm7,%ymm6 - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - vpxor %ymm5,%ymm4,%ymm4 - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - vpsrlq $17,%ymm7,%ymm7 - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - vpaddd %ymm4,%ymm3,%ymm3 - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 40+128(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - vpxor %ymm7,%ymm6,%ymm6 - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - vpshufb %ymm8,%ymm6,%ymm6 - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - vpshufd $80,%ymm3,%ymm7 - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - vpsrld $10,%ymm7,%ymm6 - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - vpsrlq $17,%ymm7,%ymm7 - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - vpxor %ymm7,%ymm6,%ymm6 - addl 44+128(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - vpsrlq $2,%ymm7,%ymm7 - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - vpxor %ymm7,%ymm6,%ymm6 - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - vpshufb %ymm9,%ymm6,%ymm6 - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - vpaddd %ymm6,%ymm3,%ymm3 - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - vpaddd 96(%rbp),%ymm3,%ymm6 - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - vmovdqa %ymm6,32(%rsp) - leaq 128(%rbp),%rbp - cmpb $0,3(%rbp) - jne L$avx2_00_47 - addl 0+64(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4+64(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - 
leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+64(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12+64(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+64(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36+64(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+64(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44+64(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - addl 0(%rsp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl 
$2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4(%rsp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8(%rsp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12(%rsp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32(%rsp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36(%rsp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal (%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40(%rsp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44(%rsp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - 
rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - movq 512(%rsp),%rdi - addl %r14d,%eax - - leaq 448(%rsp),%rbp - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - - cmpq 80(%rbp),%rsi - je L$done_avx2 - - xorl %r14d,%r14d - movl %ebx,%edi - xorl %ecx,%edi - movl %r9d,%r12d - jmp L$ower_avx2 -.p2align 4 -L$ower_avx2: - addl 0+16(%rbp),%r11d - andl %r8d,%r12d - rorxl $25,%r8d,%r13d - rorxl $11,%r8d,%r15d - leal (%rax,%r14,1),%eax - leal (%r11,%r12,1),%r11d - andnl %r10d,%r8d,%r12d - xorl %r15d,%r13d - rorxl $6,%r8d,%r14d - leal (%r11,%r12,1),%r11d - xorl %r14d,%r13d - movl %eax,%r15d - rorxl $22,%eax,%r12d - leal (%r11,%r13,1),%r11d - xorl %ebx,%r15d - rorxl $13,%eax,%r14d - rorxl $2,%eax,%r13d - leal (%rdx,%r11,1),%edx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %ebx,%edi - xorl %r13d,%r14d - leal (%r11,%rdi,1),%r11d - movl %r8d,%r12d - addl 4+16(%rbp),%r10d - andl %edx,%r12d - rorxl $25,%edx,%r13d - rorxl $11,%edx,%edi - leal (%r11,%r14,1),%r11d - leal (%r10,%r12,1),%r10d - andnl %r9d,%edx,%r12d - xorl %edi,%r13d - rorxl $6,%edx,%r14d - leal (%r10,%r12,1),%r10d - xorl %r14d,%r13d - movl %r11d,%edi - rorxl $22,%r11d,%r12d - leal (%r10,%r13,1),%r10d - xorl %eax,%edi - rorxl $13,%r11d,%r14d - rorxl $2,%r11d,%r13d - leal (%rcx,%r10,1),%ecx - andl %edi,%r15d - xorl %r12d,%r14d - xorl %eax,%r15d - xorl %r13d,%r14d - leal (%r10,%r15,1),%r10d - movl %edx,%r12d - addl 8+16(%rbp),%r9d - andl %ecx,%r12d - rorxl $25,%ecx,%r13d - rorxl $11,%ecx,%r15d - leal (%r10,%r14,1),%r10d - leal (%r9,%r12,1),%r9d - andnl %r8d,%ecx,%r12d - xorl %r15d,%r13d - rorxl $6,%ecx,%r14d - leal (%r9,%r12,1),%r9d - xorl %r14d,%r13d - movl %r10d,%r15d - rorxl $22,%r10d,%r12d - leal (%r9,%r13,1),%r9d - xorl %r11d,%r15d - rorxl $13,%r10d,%r14d - rorxl $2,%r10d,%r13d - leal (%rbx,%r9,1),%ebx - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r11d,%edi - xorl %r13d,%r14d - leal (%r9,%rdi,1),%r9d - movl %ecx,%r12d - addl 12+16(%rbp),%r8d - andl %ebx,%r12d - rorxl $25,%ebx,%r13d - rorxl $11,%ebx,%edi - leal (%r9,%r14,1),%r9d - leal (%r8,%r12,1),%r8d - andnl %edx,%ebx,%r12d - xorl %edi,%r13d - rorxl $6,%ebx,%r14d - leal (%r8,%r12,1),%r8d - xorl %r14d,%r13d - movl %r9d,%edi - rorxl $22,%r9d,%r12d - leal (%r8,%r13,1),%r8d - xorl %r10d,%edi - rorxl $13,%r9d,%r14d - rorxl $2,%r9d,%r13d - leal (%rax,%r8,1),%eax - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r10d,%r15d - xorl %r13d,%r14d - leal (%r8,%r15,1),%r8d - movl %ebx,%r12d - addl 32+16(%rbp),%edx - andl %eax,%r12d - rorxl $25,%eax,%r13d - rorxl $11,%eax,%r15d - leal (%r8,%r14,1),%r8d - leal (%rdx,%r12,1),%edx - andnl %ecx,%eax,%r12d - xorl %r15d,%r13d - rorxl $6,%eax,%r14d - leal (%rdx,%r12,1),%edx - xorl %r14d,%r13d - movl %r8d,%r15d - rorxl $22,%r8d,%r12d - leal (%rdx,%r13,1),%edx - xorl %r9d,%r15d - rorxl $13,%r8d,%r14d - rorxl $2,%r8d,%r13d - leal (%r11,%rdx,1),%r11d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %r9d,%edi - xorl %r13d,%r14d - leal (%rdx,%rdi,1),%edx - movl %eax,%r12d - addl 36+16(%rbp),%ecx - andl %r11d,%r12d - rorxl $25,%r11d,%r13d - rorxl $11,%r11d,%edi - leal (%rdx,%r14,1),%edx - leal 
(%rcx,%r12,1),%ecx - andnl %ebx,%r11d,%r12d - xorl %edi,%r13d - rorxl $6,%r11d,%r14d - leal (%rcx,%r12,1),%ecx - xorl %r14d,%r13d - movl %edx,%edi - rorxl $22,%edx,%r12d - leal (%rcx,%r13,1),%ecx - xorl %r8d,%edi - rorxl $13,%edx,%r14d - rorxl $2,%edx,%r13d - leal (%r10,%rcx,1),%r10d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %r8d,%r15d - xorl %r13d,%r14d - leal (%rcx,%r15,1),%ecx - movl %r11d,%r12d - addl 40+16(%rbp),%ebx - andl %r10d,%r12d - rorxl $25,%r10d,%r13d - rorxl $11,%r10d,%r15d - leal (%rcx,%r14,1),%ecx - leal (%rbx,%r12,1),%ebx - andnl %eax,%r10d,%r12d - xorl %r15d,%r13d - rorxl $6,%r10d,%r14d - leal (%rbx,%r12,1),%ebx - xorl %r14d,%r13d - movl %ecx,%r15d - rorxl $22,%ecx,%r12d - leal (%rbx,%r13,1),%ebx - xorl %edx,%r15d - rorxl $13,%ecx,%r14d - rorxl $2,%ecx,%r13d - leal (%r9,%rbx,1),%r9d - andl %r15d,%edi - xorl %r12d,%r14d - xorl %edx,%edi - xorl %r13d,%r14d - leal (%rbx,%rdi,1),%ebx - movl %r10d,%r12d - addl 44+16(%rbp),%eax - andl %r9d,%r12d - rorxl $25,%r9d,%r13d - rorxl $11,%r9d,%edi - leal (%rbx,%r14,1),%ebx - leal (%rax,%r12,1),%eax - andnl %r11d,%r9d,%r12d - xorl %edi,%r13d - rorxl $6,%r9d,%r14d - leal (%rax,%r12,1),%eax - xorl %r14d,%r13d - movl %ebx,%edi - rorxl $22,%ebx,%r12d - leal (%rax,%r13,1),%eax - xorl %ecx,%edi - rorxl $13,%ebx,%r14d - rorxl $2,%ebx,%r13d - leal (%r8,%rax,1),%r8d - andl %edi,%r15d - xorl %r12d,%r14d - xorl %ecx,%r15d - xorl %r13d,%r14d - leal (%rax,%r15,1),%eax - movl %r9d,%r12d - leaq -64(%rbp),%rbp - cmpq %rsp,%rbp - jae L$ower_avx2 - - movq 512(%rsp),%rdi - addl %r14d,%eax - - leaq 448(%rsp),%rsp - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - leaq 128(%rsi),%rsi - addl 24(%rdi),%r10d - movq %rsi,%r12 - addl 28(%rdi),%r11d - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - cmoveq %rsp,%r12 - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - - jbe L$oop_avx2 - leaq (%rsp),%rbp - -L$done_avx2: - leaq (%rbp),%rsp - movq 64+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_avx2: - .byte 0xf3,0xc3 diff --git a/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s b/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s index 91821da1264e86..1cae4e11fb33db 100644 --- a/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s +++ b/deps/openssl/asm/x64-macosx-gas/sha/sha512-x86_64.s @@ -5,20 +5,6 @@ .p2align 4 _sha512_block_data_order: - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - testl $2048,%r10d - jnz L$xop_shortcut - andl $296,%r11d - cmpl $296,%r11d - je L$avx2_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je L$avx_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -1795,3570 +1781,3 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - -.p2align 6 -sha512_block_data_order_xop: -L$xop_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - 
movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) -L$prologue_xop: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$loop_xop -.p2align 4 -L$loop_xop: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp L$xop_00_47 - -.p2align 4 -L$xop_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm0,%xmm0 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,223,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm7,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm0,%xmm0 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm1,%xmm1 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,216,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm0,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - 
vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm1,%xmm1 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm2,%xmm2 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,217,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm1,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm2,%xmm2 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm3,%xmm3 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,218,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm2,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm3,%xmm3 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - rorq $23,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r8,%r13 - xorq %r10,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rax,%r14 - vpaddq %xmm11,%xmm4,%xmm4 - andq %r8,%r12 - xorq %r8,%r13 - addq 
64(%rsp),%r11 - movq %rax,%r15 -.byte 143,72,120,195,209,7 - xorq %r10,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,219,3 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm3,%xmm10 - addq %r11,%rdx - addq %rdi,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %rdx,%r13 - addq %r11,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r11 - vpxor %xmm10,%xmm11,%xmm11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - vpaddq %xmm11,%xmm4,%xmm4 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - rorq $23,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rcx,%r13 - xorq %r8,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r10,%r14 - vpaddq %xmm11,%xmm5,%xmm5 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 -.byte 143,72,120,195,209,7 - xorq %r8,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,220,3 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm4,%xmm10 - addq %r9,%rbx - addq %rdi,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rbx,%r13 - addq %r9,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%r9 - vpxor %xmm10,%xmm11,%xmm11 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - vpaddq %xmm11,%xmm5,%xmm5 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - rorq $23,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %rax,%r13 - xorq %rcx,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %r8,%r14 - vpaddq %xmm11,%xmm6,%xmm6 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 -.byte 143,72,120,195,209,7 - xorq %rcx,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,221,3 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm5,%xmm10 - addq %rdx,%r11 - addq %rdi,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %r11,%r13 - addq %rdx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rdx - vpxor %xmm10,%xmm11,%xmm11 - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - vpaddq %xmm11,%xmm6,%xmm6 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - 
xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - rorq $23,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - rorq $5,%r14 -.byte 143,72,120,195,200,56 - xorq %r10,%r13 - xorq %rax,%r12 - vpsrlq $7,%xmm8,%xmm8 - rorq $4,%r13 - xorq %rcx,%r14 - vpaddq %xmm11,%xmm7,%xmm7 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 -.byte 143,72,120,195,209,7 - xorq %rax,%r12 - rorq $6,%r14 - vpxor %xmm9,%xmm8,%xmm8 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi -.byte 143,104,120,195,222,3 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - rorq $28,%r14 - vpsrlq $6,%xmm6,%xmm10 - addq %rbx,%r9 - addq %rdi,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r9,%r13 - addq %rbx,%r14 -.byte 143,72,120,195,203,42 - rorq $23,%r13 - movq %r14,%rbx - vpxor %xmm10,%xmm11,%xmm11 - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm9,%xmm11,%xmm11 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - vpaddq %xmm11,%xmm7,%xmm7 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne L$xop_00_47 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq 
%r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - rorq $23,%r13 - movq %r14,%rax - movq %r9,%r12 - rorq $5,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - rorq $4,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - rorq $6,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - rorq $14,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - rorq $28,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - rorq $23,%r13 - movq %r14,%r11 - movq %r8,%r12 - rorq $5,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - rorq $4,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - rorq $6,%r14 - xorq %rax,%rdi - addq %r12,%r10 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - rorq $28,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - rorq $23,%r13 - movq %r14,%r10 - movq %rdx,%r12 - rorq $5,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - rorq $4,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - rorq $6,%r14 - xorq %r11,%r15 - addq %r12,%r9 - rorq $14,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - rorq $28,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - rorq $23,%r13 - movq %r14,%r9 - movq %rcx,%r12 - rorq $5,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - rorq $4,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - rorq $6,%r14 - xorq %r10,%rdi - addq %r12,%r8 - rorq $14,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - rorq $28,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - rorq $23,%r13 - movq %r14,%r8 - movq %rbx,%r12 - rorq $5,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - rorq $4,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - rorq $6,%r14 - xorq %r9,%r15 - addq %r12,%rdx - rorq $14,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - rorq $28,%r14 - addq %rdx,%r11 - addq 
%rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - rorq $23,%r13 - movq %r14,%rdx - movq %rax,%r12 - rorq $5,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - rorq $4,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - rorq $6,%r14 - xorq %r8,%rdi - addq %r12,%rcx - rorq $14,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - rorq $28,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - rorq $23,%r13 - movq %r14,%rcx - movq %r11,%r12 - rorq $5,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - rorq $4,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - rorq $6,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - rorq $14,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - rorq $28,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - rorq $23,%r13 - movq %r14,%rbx - movq %r10,%r12 - rorq $5,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - rorq $4,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - rorq $6,%r14 - xorq %rcx,%rdi - addq %r12,%rax - rorq $14,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - rorq $28,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb L$loop_xop - - movq 128+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_xop: - .byte 0xf3,0xc3 - - -.p2align 6 -sha512_block_data_order_avx: -L$avx_shortcut: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) -L$prologue_avx: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$loop_avx -.p2align 4 -L$loop_avx: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa 
%xmm11,112(%rsp) - movq %r8,%r13 - jmp L$avx_00_47 - -.p2align 4 -L$avx_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm0,%xmm0 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 0(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm7,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm7,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm7,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 8(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm0,%xmm0 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm1,%xmm1 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 16(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm0,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm0,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm0,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 24(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm1,%xmm1 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm2,%xmm2 - shrdq 
$4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 32(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm1,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm1,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm1,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 40(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm2,%xmm2 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm3,%xmm3 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 48(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm2,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm2,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm2,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 56(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm3,%xmm3 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm4,%xmm4 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 64(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq 
$14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm3,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm3,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm3,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 72(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm4,%xmm4 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm5,%xmm5 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 80(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm4,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm4,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm4,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 88(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm5,%xmm5 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm6,%xmm6 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 96(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm5,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm5,%xmm10 - shrdq $23,%r13,%r13 - movq 
%r14,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm5,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 104(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm6,%xmm6 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm7,%xmm7 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 112(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm6,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm6,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm6,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 120(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm7,%xmm7 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne L$avx_00_47 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq 
%r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq 
$23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb L$loop_avx - - movq 128+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_avx: - .byte 0xf3,0xc3 - - -.p2align 6 -sha512_block_data_order_avx2: -L$avx2_shortcut: - pushq %rbx - pushq %rbp - 
pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - movq %rsp,%r11 - subq $1312,%rsp - shlq $4,%rdx - andq $-2048,%rsp - leaq (%rsi,%rdx,8),%rdx - addq $1152,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) -L$prologue_avx2: - - vzeroupper - subq $-128,%rsi - movq 0(%rdi),%rax - movq %rsi,%r12 - movq 8(%rdi),%rbx - cmpq %rdx,%rsi - movq 16(%rdi),%rcx - cmoveq %rsp,%r12 - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$oop_avx2 -.p2align 4 -L$oop_avx2: - vmovdqu -128(%rsi),%xmm0 - vmovdqu -128+16(%rsi),%xmm1 - vmovdqu -128+32(%rsi),%xmm2 - leaq K512+128(%rip),%rbp - vmovdqu -128+48(%rsi),%xmm3 - vmovdqu -128+64(%rsi),%xmm4 - vmovdqu -128+80(%rsi),%xmm5 - vmovdqu -128+96(%rsi),%xmm6 - vmovdqu -128+112(%rsi),%xmm7 - - vmovdqa 1152(%rbp),%ymm10 - vinserti128 $1,(%r12),%ymm0,%ymm0 - vinserti128 $1,16(%r12),%ymm1,%ymm1 - vpshufb %ymm10,%ymm0,%ymm0 - vinserti128 $1,32(%r12),%ymm2,%ymm2 - vpshufb %ymm10,%ymm1,%ymm1 - vinserti128 $1,48(%r12),%ymm3,%ymm3 - vpshufb %ymm10,%ymm2,%ymm2 - vinserti128 $1,64(%r12),%ymm4,%ymm4 - vpshufb %ymm10,%ymm3,%ymm3 - vinserti128 $1,80(%r12),%ymm5,%ymm5 - vpshufb %ymm10,%ymm4,%ymm4 - vinserti128 $1,96(%r12),%ymm6,%ymm6 - vpshufb %ymm10,%ymm5,%ymm5 - vinserti128 $1,112(%r12),%ymm7,%ymm7 - - vpaddq -128(%rbp),%ymm0,%ymm8 - vpshufb %ymm10,%ymm6,%ymm6 - vpaddq -96(%rbp),%ymm1,%ymm9 - vpshufb %ymm10,%ymm7,%ymm7 - vpaddq -64(%rbp),%ymm2,%ymm10 - vpaddq -32(%rbp),%ymm3,%ymm11 - vmovdqa %ymm8,0(%rsp) - vpaddq 0(%rbp),%ymm4,%ymm8 - vmovdqa %ymm9,32(%rsp) - vpaddq 32(%rbp),%ymm5,%ymm9 - vmovdqa %ymm10,64(%rsp) - vpaddq 64(%rbp),%ymm6,%ymm10 - vmovdqa %ymm11,96(%rsp) - leaq -128(%rsp),%rsp - vpaddq 96(%rbp),%ymm7,%ymm11 - vmovdqa %ymm8,0(%rsp) - xorq %r14,%r14 - vmovdqa %ymm9,32(%rsp) - movq %rbx,%rdi - vmovdqa %ymm10,64(%rsp) - xorq %rcx,%rdi - vmovdqa %ymm11,96(%rsp) - movq %r9,%r12 - addq $32*8,%rbp - jmp L$avx2_00_47 - -.p2align 4 -L$avx2_00_47: - leaq -128(%rsp),%rsp - vpalignr $8,%ymm0,%ymm1,%ymm8 - addq 0+256(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - vpalignr $8,%ymm4,%ymm5,%ymm11 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - vpsrlq $1,%ymm8,%ymm10 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - vpaddq %ymm11,%ymm0,%ymm0 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - vpsrlq $6,%ymm7,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - vpsllq $3,%ymm7,%ymm10 - vpaddq %ymm8,%ymm0,%ymm0 - addq 8+256(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - vpsrlq $19,%ymm7,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - vpaddq %ymm11,%ymm0,%ymm0 - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - vpaddq 
-128(%rbp),%ymm0,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - vmovdqa %ymm10,0(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm8 - addq 32+256(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - vpalignr $8,%ymm5,%ymm6,%ymm11 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - vpsrlq $1,%ymm8,%ymm10 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - vpaddq %ymm11,%ymm1,%ymm1 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - vpsrlq $6,%ymm0,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - vpsllq $3,%ymm0,%ymm10 - vpaddq %ymm8,%ymm1,%ymm1 - addq 40+256(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - vpsrlq $19,%ymm0,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - vpaddq %ymm11,%ymm1,%ymm1 - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - vpaddq -96(%rbp),%ymm1,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - vmovdqa %ymm10,32(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm8 - addq 64+256(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - vpalignr $8,%ymm6,%ymm7,%ymm11 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - vpsrlq $1,%ymm8,%ymm10 - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - vpaddq %ymm11,%ymm2,%ymm2 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - vpsrlq $6,%ymm1,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - vpsllq $3,%ymm1,%ymm10 - vpaddq %ymm8,%ymm2,%ymm2 - addq 72+256(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - vpsrlq $19,%ymm1,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - vpaddq %ymm11,%ymm2,%ymm2 - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - vpaddq -64(%rbp),%ymm2,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - vmovdqa %ymm10,64(%rsp) - vpalignr $8,%ymm3,%ymm4,%ymm8 - addq 
96+256(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - vpalignr $8,%ymm7,%ymm0,%ymm11 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - vpsrlq $1,%ymm8,%ymm10 - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - vpaddq %ymm11,%ymm3,%ymm3 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - vpsrlq $6,%ymm2,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - vpsllq $3,%ymm2,%ymm10 - vpaddq %ymm8,%ymm3,%ymm3 - addq 104+256(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - vpsrlq $19,%ymm2,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - vpaddq %ymm11,%ymm3,%ymm3 - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - vpaddq -32(%rbp),%ymm3,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - vmovdqa %ymm10,96(%rsp) - leaq -128(%rsp),%rsp - vpalignr $8,%ymm4,%ymm5,%ymm8 - addq 0+256(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - vpalignr $8,%ymm0,%ymm1,%ymm11 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - vpsrlq $1,%ymm8,%ymm10 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - vpaddq %ymm11,%ymm4,%ymm4 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - vpsrlq $6,%ymm3,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - vpsllq $3,%ymm3,%ymm10 - vpaddq %ymm8,%ymm4,%ymm4 - addq 8+256(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - vpsrlq $19,%ymm3,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - vpaddq %ymm11,%ymm4,%ymm4 - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - vpaddq 0(%rbp),%ymm4,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - vmovdqa %ymm10,0(%rsp) - vpalignr $8,%ymm5,%ymm6,%ymm8 - addq 32+256(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - vpalignr $8,%ymm1,%ymm2,%ymm11 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - vpsrlq 
$1,%ymm8,%ymm10 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - vpaddq %ymm11,%ymm5,%ymm5 - vpsrlq $7,%ymm8,%ymm11 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - vpsrlq $6,%ymm4,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - vpsllq $3,%ymm4,%ymm10 - vpaddq %ymm8,%ymm5,%ymm5 - addq 40+256(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - vpsrlq $19,%ymm4,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - vpaddq %ymm11,%ymm5,%ymm5 - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - vpaddq 32(%rbp),%ymm5,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - vmovdqa %ymm10,32(%rsp) - vpalignr $8,%ymm6,%ymm7,%ymm8 - addq 64+256(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - vpalignr $8,%ymm2,%ymm3,%ymm11 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - vpsrlq $1,%ymm8,%ymm10 - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - vpaddq %ymm11,%ymm6,%ymm6 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - vpsllq $56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - vpsrlq $6,%ymm5,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - vpsllq $3,%ymm5,%ymm10 - vpaddq %ymm8,%ymm6,%ymm6 - addq 72+256(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - vpsrlq $19,%ymm5,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - vpaddq %ymm11,%ymm6,%ymm6 - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - vpaddq 64(%rbp),%ymm6,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - vmovdqa %ymm10,64(%rsp) - vpalignr $8,%ymm7,%ymm0,%ymm8 - addq 96+256(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - vpalignr $8,%ymm3,%ymm4,%ymm11 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - vpsrlq $1,%ymm8,%ymm10 - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - vpaddq %ymm11,%ymm7,%ymm7 - vpsrlq $7,%ymm8,%ymm11 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - vpsllq 
$56,%ymm8,%ymm9 - vpxor %ymm10,%ymm11,%ymm8 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - vpsrlq $7,%ymm10,%ymm10 - vpxor %ymm9,%ymm8,%ymm8 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - vpsllq $7,%ymm9,%ymm9 - vpxor %ymm10,%ymm8,%ymm8 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - vpsrlq $6,%ymm6,%ymm11 - vpxor %ymm9,%ymm8,%ymm8 - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - vpsllq $3,%ymm6,%ymm10 - vpaddq %ymm8,%ymm7,%ymm7 - addq 104+256(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - vpsrlq $19,%ymm6,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - vpsllq $42,%ymm10,%ymm10 - vpxor %ymm9,%ymm11,%ymm11 - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - vpsrlq $42,%ymm9,%ymm9 - vpxor %ymm10,%ymm11,%ymm11 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - vpxor %ymm9,%ymm11,%ymm11 - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - vpaddq %ymm11,%ymm7,%ymm7 - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - vpaddq 96(%rbp),%ymm7,%ymm10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - vmovdqa %ymm10,96(%rsp) - leaq 256(%rbp),%rbp - cmpb $0,-121(%rbp) - jne L$avx2_00_47 - addq 0+128(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8+128(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32+128(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40+128(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64+128(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq 
%r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72+128(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96+128(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104+128(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - addq 0(%rsp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8(%rsp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32(%rsp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40(%rsp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - 
xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64(%rsp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72(%rsp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96(%rsp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104(%rsp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - movq 1280(%rsp),%rdi - addq %r14,%rax - - leaq 1152(%rsp),%rbp - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - - cmpq 144(%rbp),%rsi - je L$done_avx2 - - xorq %r14,%r14 - movq %rbx,%rdi - xorq %rcx,%rdi - movq %r9,%r12 - jmp L$ower_avx2 -.p2align 4 -L$ower_avx2: - addq 0+16(%rbp),%r11 - andq %r8,%r12 - rorxq $41,%r8,%r13 - rorxq $18,%r8,%r15 - leaq (%rax,%r14,1),%rax - leaq (%r11,%r12,1),%r11 - andnq %r10,%r8,%r12 - xorq %r15,%r13 - rorxq $14,%r8,%r14 - leaq (%r11,%r12,1),%r11 - xorq %r14,%r13 - movq %rax,%r15 - rorxq $39,%rax,%r12 - leaq (%r11,%r13,1),%r11 - xorq %rbx,%r15 - rorxq $34,%rax,%r14 - rorxq $28,%rax,%r13 - leaq (%rdx,%r11,1),%rdx - andq %r15,%rdi - xorq %r12,%r14 - xorq %rbx,%rdi - xorq %r13,%r14 - leaq (%r11,%rdi,1),%r11 - movq %r8,%r12 - addq 8+16(%rbp),%r10 - andq %rdx,%r12 - rorxq $41,%rdx,%r13 - rorxq $18,%rdx,%rdi - leaq (%r11,%r14,1),%r11 - leaq (%r10,%r12,1),%r10 - andnq %r9,%rdx,%r12 - xorq %rdi,%r13 - rorxq $14,%rdx,%r14 - leaq (%r10,%r12,1),%r10 - xorq %r14,%r13 - movq %r11,%rdi - rorxq $39,%r11,%r12 - leaq (%r10,%r13,1),%r10 - xorq %rax,%rdi - 
rorxq $34,%r11,%r14 - rorxq $28,%r11,%r13 - leaq (%rcx,%r10,1),%rcx - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rax,%r15 - xorq %r13,%r14 - leaq (%r10,%r15,1),%r10 - movq %rdx,%r12 - addq 32+16(%rbp),%r9 - andq %rcx,%r12 - rorxq $41,%rcx,%r13 - rorxq $18,%rcx,%r15 - leaq (%r10,%r14,1),%r10 - leaq (%r9,%r12,1),%r9 - andnq %r8,%rcx,%r12 - xorq %r15,%r13 - rorxq $14,%rcx,%r14 - leaq (%r9,%r12,1),%r9 - xorq %r14,%r13 - movq %r10,%r15 - rorxq $39,%r10,%r12 - leaq (%r9,%r13,1),%r9 - xorq %r11,%r15 - rorxq $34,%r10,%r14 - rorxq $28,%r10,%r13 - leaq (%rbx,%r9,1),%rbx - andq %r15,%rdi - xorq %r12,%r14 - xorq %r11,%rdi - xorq %r13,%r14 - leaq (%r9,%rdi,1),%r9 - movq %rcx,%r12 - addq 40+16(%rbp),%r8 - andq %rbx,%r12 - rorxq $41,%rbx,%r13 - rorxq $18,%rbx,%rdi - leaq (%r9,%r14,1),%r9 - leaq (%r8,%r12,1),%r8 - andnq %rdx,%rbx,%r12 - xorq %rdi,%r13 - rorxq $14,%rbx,%r14 - leaq (%r8,%r12,1),%r8 - xorq %r14,%r13 - movq %r9,%rdi - rorxq $39,%r9,%r12 - leaq (%r8,%r13,1),%r8 - xorq %r10,%rdi - rorxq $34,%r9,%r14 - rorxq $28,%r9,%r13 - leaq (%rax,%r8,1),%rax - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r10,%r15 - xorq %r13,%r14 - leaq (%r8,%r15,1),%r8 - movq %rbx,%r12 - addq 64+16(%rbp),%rdx - andq %rax,%r12 - rorxq $41,%rax,%r13 - rorxq $18,%rax,%r15 - leaq (%r8,%r14,1),%r8 - leaq (%rdx,%r12,1),%rdx - andnq %rcx,%rax,%r12 - xorq %r15,%r13 - rorxq $14,%rax,%r14 - leaq (%rdx,%r12,1),%rdx - xorq %r14,%r13 - movq %r8,%r15 - rorxq $39,%r8,%r12 - leaq (%rdx,%r13,1),%rdx - xorq %r9,%r15 - rorxq $34,%r8,%r14 - rorxq $28,%r8,%r13 - leaq (%r11,%rdx,1),%r11 - andq %r15,%rdi - xorq %r12,%r14 - xorq %r9,%rdi - xorq %r13,%r14 - leaq (%rdx,%rdi,1),%rdx - movq %rax,%r12 - addq 72+16(%rbp),%rcx - andq %r11,%r12 - rorxq $41,%r11,%r13 - rorxq $18,%r11,%rdi - leaq (%rdx,%r14,1),%rdx - leaq (%rcx,%r12,1),%rcx - andnq %rbx,%r11,%r12 - xorq %rdi,%r13 - rorxq $14,%r11,%r14 - leaq (%rcx,%r12,1),%rcx - xorq %r14,%r13 - movq %rdx,%rdi - rorxq $39,%rdx,%r12 - leaq (%rcx,%r13,1),%rcx - xorq %r8,%rdi - rorxq $34,%rdx,%r14 - rorxq $28,%rdx,%r13 - leaq (%r10,%rcx,1),%r10 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %r8,%r15 - xorq %r13,%r14 - leaq (%rcx,%r15,1),%rcx - movq %r11,%r12 - addq 96+16(%rbp),%rbx - andq %r10,%r12 - rorxq $41,%r10,%r13 - rorxq $18,%r10,%r15 - leaq (%rcx,%r14,1),%rcx - leaq (%rbx,%r12,1),%rbx - andnq %rax,%r10,%r12 - xorq %r15,%r13 - rorxq $14,%r10,%r14 - leaq (%rbx,%r12,1),%rbx - xorq %r14,%r13 - movq %rcx,%r15 - rorxq $39,%rcx,%r12 - leaq (%rbx,%r13,1),%rbx - xorq %rdx,%r15 - rorxq $34,%rcx,%r14 - rorxq $28,%rcx,%r13 - leaq (%r9,%rbx,1),%r9 - andq %r15,%rdi - xorq %r12,%r14 - xorq %rdx,%rdi - xorq %r13,%r14 - leaq (%rbx,%rdi,1),%rbx - movq %r10,%r12 - addq 104+16(%rbp),%rax - andq %r9,%r12 - rorxq $41,%r9,%r13 - rorxq $18,%r9,%rdi - leaq (%rbx,%r14,1),%rbx - leaq (%rax,%r12,1),%rax - andnq %r11,%r9,%r12 - xorq %rdi,%r13 - rorxq $14,%r9,%r14 - leaq (%rax,%r12,1),%rax - xorq %r14,%r13 - movq %rbx,%rdi - rorxq $39,%rbx,%r12 - leaq (%rax,%r13,1),%rax - xorq %rcx,%rdi - rorxq $34,%rbx,%r14 - rorxq $28,%rbx,%r13 - leaq (%r8,%rax,1),%r8 - andq %rdi,%r15 - xorq %r12,%r14 - xorq %rcx,%r15 - xorq %r13,%r14 - leaq (%rax,%r15,1),%rax - movq %r9,%r12 - leaq -128(%rbp),%rbp - cmpq %rsp,%rbp - jae L$ower_avx2 - - movq 1280(%rsp),%rdi - addq %r14,%rax - - leaq 1152(%rsp),%rsp - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - leaq 256(%rsi),%rsi - addq 48(%rdi),%r10 - movq %rsi,%r12 - addq 56(%rdi),%r11 - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - cmoveq 
%rsp,%r12 - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - - jbe L$oop_avx2 - leaq (%rsp),%rbp - -L$done_avx2: - leaq (%rbp),%rsp - movq 128+24(%rsp),%rsi - vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp -L$epilogue_avx2: - .byte 0xf3,0xc3 diff --git a/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm b/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm index 0d3107834e6983..c24d0c5e6a3d3d 100644 --- a/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm +++ b/deps/openssl/asm/x64-win32-masm/bn/rsaz-avx2.asm @@ -1628,8 +1628,9 @@ PUBLIC rsaz_1024_gather5_avx2 ALIGN 32 rsaz_1024_gather5_avx2 PROC PUBLIC - lea rax,QWORD PTR[((-136))+rsp] vzeroupper + mov r11,rsp + lea rax,QWORD PTR[((-136))+rsp] $L$SEH_begin_rsaz_1024_gather5:: DB 048h,08dh,060h,0e0h @@ -1643,66 +1644,125 @@ DB 0c5h,078h,029h,060h,040h DB 0c5h,078h,029h,068h,050h DB 0c5h,078h,029h,070h,060h DB 0c5h,078h,029h,078h,070h - lea r11,QWORD PTR[$L$gather_table] - mov eax,r8d - and r8d,3 - shr eax,2 - shl r8d,4 - - vmovdqu ymm7,YMMWORD PTR[((-32))+r11] - vpbroadcastb xmm8,BYTE PTR[8+rax*1+r11] - vpbroadcastb xmm9,BYTE PTR[7+rax*1+r11] - vpbroadcastb xmm10,BYTE PTR[6+rax*1+r11] - vpbroadcastb xmm11,BYTE PTR[5+rax*1+r11] - vpbroadcastb xmm12,BYTE PTR[4+rax*1+r11] - vpbroadcastb xmm13,BYTE PTR[3+rax*1+r11] - vpbroadcastb xmm14,BYTE PTR[2+rax*1+r11] - vpbroadcastb xmm15,BYTE PTR[1+rax*1+r11] - - lea rdx,QWORD PTR[64+r8*1+rdx] - mov r11,64 - mov eax,9 - jmp $L$oop_gather_1024 + lea rsp,QWORD PTR[((-256))+rsp] + and rsp,-32 + lea r10,QWORD PTR[$L$inc] + lea rax,QWORD PTR[((-128))+rsp] + + vmovd xmm4,r8d + vmovdqa ymm0,YMMWORD PTR[r10] + vmovdqa ymm1,YMMWORD PTR[32+r10] + vmovdqa ymm5,YMMWORD PTR[64+r10] + vpbroadcastd ymm4,xmm4 + + vpaddd ymm2,ymm0,ymm5 + vpcmpeqd ymm0,ymm0,ymm4 + vpaddd ymm3,ymm1,ymm5 + vpcmpeqd ymm1,ymm1,ymm4 + vmovdqa YMMWORD PTR[(0+128)+rax],ymm0 + vpaddd ymm0,ymm2,ymm5 + vpcmpeqd ymm2,ymm2,ymm4 + vmovdqa YMMWORD PTR[(32+128)+rax],ymm1 + vpaddd ymm1,ymm3,ymm5 + vpcmpeqd ymm3,ymm3,ymm4 + vmovdqa YMMWORD PTR[(64+128)+rax],ymm2 + vpaddd ymm2,ymm0,ymm5 + vpcmpeqd ymm0,ymm0,ymm4 + vmovdqa YMMWORD PTR[(96+128)+rax],ymm3 + vpaddd ymm3,ymm1,ymm5 + vpcmpeqd ymm1,ymm1,ymm4 + vmovdqa YMMWORD PTR[(128+128)+rax],ymm0 + vpaddd ymm8,ymm2,ymm5 + vpcmpeqd ymm2,ymm2,ymm4 + vmovdqa YMMWORD PTR[(160+128)+rax],ymm1 + vpaddd ymm9,ymm3,ymm5 + vpcmpeqd ymm3,ymm3,ymm4 + vmovdqa YMMWORD PTR[(192+128)+rax],ymm2 + vpaddd ymm10,ymm8,ymm5 + vpcmpeqd ymm8,ymm8,ymm4 + vmovdqa YMMWORD PTR[(224+128)+rax],ymm3 + vpaddd ymm11,ymm9,ymm5 + vpcmpeqd ymm9,ymm9,ymm4 + vpaddd ymm12,ymm10,ymm5 + vpcmpeqd ymm10,ymm10,ymm4 + vpaddd ymm13,ymm11,ymm5 + vpcmpeqd ymm11,ymm11,ymm4 + vpaddd ymm14,ymm12,ymm5 + vpcmpeqd ymm12,ymm12,ymm4 + vpaddd ymm15,ymm13,ymm5 + vpcmpeqd ymm13,ymm13,ymm4 + vpcmpeqd ymm14,ymm14,ymm4 + vpcmpeqd ymm15,ymm15,ymm4 + + vmovdqa ymm7,YMMWORD PTR[((-32))+r10] + lea rdx,QWORD PTR[128+rdx] + mov r8d,9 -ALIGN 32 $L$oop_gather_1024:: - vpand xmm0,xmm8,XMMWORD PTR[((-64))+rdx] - vpand xmm1,xmm9,XMMWORD PTR[rdx] - vpand xmm2,xmm10,XMMWORD PTR[64+rdx] - vpand xmm3,xmm11,XMMWORD PTR[r11*2+rdx] - vpor xmm1,xmm1,xmm0 - vpand xmm4,xmm12,XMMWORD PTR[64+r11*2+rdx] - vpor xmm3,xmm3,xmm2 - vpand xmm5,xmm13,XMMWORD PTR[r11*4+rdx] - vpor xmm3,xmm3,xmm1 - vpand xmm6,xmm14,XMMWORD PTR[64+r11*4+rdx] + vmovdqa ymm0,YMMWORD PTR[((0-128))+rdx] + vmovdqa ymm1,YMMWORD 
PTR[((32-128))+rdx] + vmovdqa ymm2,YMMWORD PTR[((64-128))+rdx] + vmovdqa ymm3,YMMWORD PTR[((96-128))+rdx] + vpand ymm0,ymm0,YMMWORD PTR[((0+128))+rax] + vpand ymm1,ymm1,YMMWORD PTR[((32+128))+rax] + vpand ymm2,ymm2,YMMWORD PTR[((64+128))+rax] + vpor ymm4,ymm1,ymm0 + vpand ymm3,ymm3,YMMWORD PTR[((96+128))+rax] + vmovdqa ymm0,YMMWORD PTR[((128-128))+rdx] + vmovdqa ymm1,YMMWORD PTR[((160-128))+rdx] + vpor ymm5,ymm3,ymm2 + vmovdqa ymm2,YMMWORD PTR[((192-128))+rdx] + vmovdqa ymm3,YMMWORD PTR[((224-128))+rdx] + vpand ymm0,ymm0,YMMWORD PTR[((128+128))+rax] + vpand ymm1,ymm1,YMMWORD PTR[((160+128))+rax] + vpand ymm2,ymm2,YMMWORD PTR[((192+128))+rax] + vpor ymm4,ymm4,ymm0 + vpand ymm3,ymm3,YMMWORD PTR[((224+128))+rax] + vpand ymm0,ymm8,YMMWORD PTR[((256-128))+rdx] + vpor ymm5,ymm5,ymm1 + vpand ymm1,ymm9,YMMWORD PTR[((288-128))+rdx] + vpor ymm4,ymm4,ymm2 + vpand ymm2,ymm10,YMMWORD PTR[((320-128))+rdx] + vpor ymm5,ymm5,ymm3 + vpand ymm3,ymm11,YMMWORD PTR[((352-128))+rdx] + vpor ymm4,ymm4,ymm0 + vpand ymm0,ymm12,YMMWORD PTR[((384-128))+rdx] + vpor ymm5,ymm5,ymm1 + vpand ymm1,ymm13,YMMWORD PTR[((416-128))+rdx] + vpor ymm4,ymm4,ymm2 + vpand ymm2,ymm14,YMMWORD PTR[((448-128))+rdx] + vpor ymm5,ymm5,ymm3 + vpand ymm3,ymm15,YMMWORD PTR[((480-128))+rdx] + lea rdx,QWORD PTR[512+rdx] + vpor ymm4,ymm4,ymm0 + vpor ymm5,ymm5,ymm1 + vpor ymm4,ymm4,ymm2 + vpor ymm5,ymm5,ymm3 + + vpor ymm4,ymm4,ymm5 + vextracti128 xmm5,ymm4,1 vpor xmm5,xmm5,xmm4 - vpand xmm2,xmm15,XMMWORD PTR[((-128))+r11*8+rdx] - lea rdx,QWORD PTR[r11*8+rdx] - vpor xmm5,xmm5,xmm3 - vpor xmm6,xmm6,xmm2 - vpor xmm6,xmm6,xmm5 - vpermd ymm6,ymm7,ymm6 - vmovdqu YMMWORD PTR[rcx],ymm6 + vpermd ymm5,ymm7,ymm5 + vmovdqu YMMWORD PTR[rcx],ymm5 lea rcx,QWORD PTR[32+rcx] - dec eax + dec r8d jnz $L$oop_gather_1024 vpxor ymm0,ymm0,ymm0 vmovdqu YMMWORD PTR[rcx],ymm0 vzeroupper - movaps xmm6,XMMWORD PTR[rsp] - movaps xmm7,XMMWORD PTR[16+rsp] - movaps xmm8,XMMWORD PTR[32+rsp] - movaps xmm9,XMMWORD PTR[48+rsp] - movaps xmm10,XMMWORD PTR[64+rsp] - movaps xmm11,XMMWORD PTR[80+rsp] - movaps xmm12,XMMWORD PTR[96+rsp] - movaps xmm13,XMMWORD PTR[112+rsp] - movaps xmm14,XMMWORD PTR[128+rsp] - movaps xmm15,XMMWORD PTR[144+rsp] - lea rsp,QWORD PTR[168+rsp] + movaps xmm6,XMMWORD PTR[((-168))+r11] + movaps xmm7,XMMWORD PTR[((-152))+r11] + movaps xmm8,XMMWORD PTR[((-136))+r11] + movaps xmm9,XMMWORD PTR[((-120))+r11] + movaps xmm10,XMMWORD PTR[((-104))+r11] + movaps xmm11,XMMWORD PTR[((-88))+r11] + movaps xmm12,XMMWORD PTR[((-72))+r11] + movaps xmm13,XMMWORD PTR[((-56))+r11] + movaps xmm14,XMMWORD PTR[((-40))+r11] + movaps xmm15,XMMWORD PTR[((-24))+r11] $L$SEH_end_rsaz_1024_gather5:: + lea rsp,QWORD PTR[r11] DB 0F3h,0C3h ;repret rsaz_1024_gather5_avx2 ENDP EXTERN OPENSSL_ia32cap_P:NEAR @@ -1728,8 +1788,10 @@ $L$scatter_permd:: DD 0,2,4,6,7,7,7,7 $L$gather_permd:: DD 0,7,1,7,2,7,3,7 -$L$gather_table:: -DB 0,0,0,0,0,0,0,0,0ffh,0,0,0,0,0,0,0 +$L$inc:: + DD 0,0,0,0,1,1,1,1 + DD 2,2,2,2,3,3,3,3 + DD 4,4,4,4,4,4,4,4 ALIGN 64 EXTERN __imp_RtlVirtualUnwind:NEAR @@ -1850,7 +1912,7 @@ DB 9,0,0,0 DD imagerel rsaz_se_handler DD imagerel $L$mul_1024_body,imagerel $L$mul_1024_epilogue $L$SEH_info_rsaz_1024_gather5:: -DB 001h,033h,016h,000h +DB 001h,036h,017h,00bh DB 036h,0f8h,009h,000h DB 031h,0e8h,008h,000h DB 02ch,0d8h,007h,000h @@ -1862,6 +1924,7 @@ DB 013h,088h,002h,000h DB 00eh,078h,001h,000h DB 009h,068h,000h,000h DB 004h,001h,015h,000h +DB 000h,0b3h,000h,000h .xdata ENDS END diff --git a/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm 
b/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm index 1c6440470d9992..e431b620902c8b 100644 --- a/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm +++ b/deps/openssl/asm/x64-win32-masm/bn/rsaz-x86_64.asm @@ -803,52 +803,108 @@ $L$SEH_begin_rsaz_512_mul_gather4:: push r14 push r15 - mov r9d,r9d - sub rsp,128+24 + sub rsp,328 + movaps XMMWORD PTR[160+rsp],xmm6 + movaps XMMWORD PTR[176+rsp],xmm7 + movaps XMMWORD PTR[192+rsp],xmm8 + movaps XMMWORD PTR[208+rsp],xmm9 + movaps XMMWORD PTR[224+rsp],xmm10 + movaps XMMWORD PTR[240+rsp],xmm11 + movaps XMMWORD PTR[256+rsp],xmm12 + movaps XMMWORD PTR[272+rsp],xmm13 + movaps XMMWORD PTR[288+rsp],xmm14 + movaps XMMWORD PTR[304+rsp],xmm15 $L$mul_gather4_body:: + movd xmm8,r9d + movdqa xmm1,XMMWORD PTR[(($L$inc+16))] + movdqa xmm0,XMMWORD PTR[$L$inc] + + pshufd xmm8,xmm8,0 + movdqa xmm7,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm8 + movdqa xmm3,xmm7 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm8 + movdqa xmm4,xmm7 + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm8 + movdqa xmm5,xmm7 + paddd xmm4,xmm3 + pcmpeqd xmm3,xmm8 + movdqa xmm6,xmm7 + paddd xmm5,xmm4 + pcmpeqd xmm4,xmm8 + paddd xmm6,xmm5 + pcmpeqd xmm5,xmm8 + paddd xmm7,xmm6 + pcmpeqd xmm6,xmm8 + pcmpeqd xmm7,xmm8 + + movdqa xmm8,XMMWORD PTR[rdx] + movdqa xmm9,XMMWORD PTR[16+rdx] + movdqa xmm10,XMMWORD PTR[32+rdx] + movdqa xmm11,XMMWORD PTR[48+rdx] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD PTR[64+rdx] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD PTR[80+rdx] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD PTR[96+rdx] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD PTR[112+rdx] + lea rbp,QWORD PTR[128+rdx] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,04eh + por xmm8,xmm9 mov r11d,080100h and r11d,DWORD PTR[((OPENSSL_ia32cap_P+8))] cmp r11d,080100h je $L$mulx_gather - mov eax,DWORD PTR[64+r9*4+rdx] -DB 102,72,15,110,199 - mov ebx,DWORD PTR[r9*4+rdx] -DB 102,72,15,110,201 +DB 102,76,15,126,195 + mov QWORD PTR[128+rsp],r8 + mov QWORD PTR[((128+8))+rsp],rdi + mov QWORD PTR[((128+16))+rsp],rcx - shl rax,32 - or rbx,rax mov rax,QWORD PTR[rsi] mov rcx,QWORD PTR[8+rsi] - lea rbp,QWORD PTR[128+r9*4+rdx] mul rbx mov QWORD PTR[rsp],rax mov rax,rcx mov r8,rdx mul rbx - movd xmm4,DWORD PTR[rbp] add r8,rax mov rax,QWORD PTR[16+rsi] mov r9,rdx adc r9,0 mul rbx - movd xmm5,DWORD PTR[64+rbp] add r9,rax mov rax,QWORD PTR[24+rsi] mov r10,rdx adc r10,0 mul rbx - pslldq xmm5,4 add r10,rax mov rax,QWORD PTR[32+rsi] mov r11,rdx adc r11,0 mul rbx - por xmm4,xmm5 add r11,rax mov rax,QWORD PTR[40+rsi] mov r12,rdx @@ -861,14 +917,12 @@ DB 102,72,15,110,201 adc r13,0 mul rbx - lea rbp,QWORD PTR[128+rbp] add r13,rax mov rax,QWORD PTR[56+rsi] mov r14,rdx adc r14,0 mul rbx -DB 102,72,15,126,227 add r14,rax mov rax,QWORD PTR[rsi] mov r15,rdx @@ -880,6 +934,35 @@ DB 102,72,15,126,227 ALIGN 32 $L$oop_mul_gather:: + movdqa xmm8,XMMWORD PTR[rbp] + movdqa xmm9,XMMWORD PTR[16+rbp] + movdqa xmm10,XMMWORD PTR[32+rbp] + movdqa xmm11,XMMWORD PTR[48+rbp] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD PTR[64+rbp] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD PTR[80+rbp] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD PTR[96+rbp] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD PTR[112+rbp] + lea rbp,QWORD PTR[128+rbp] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd 
xmm9,xmm8,04eh + por xmm8,xmm9 +DB 102,76,15,126,195 + mul rbx add r8,rax mov rax,QWORD PTR[8+rsi] @@ -888,7 +971,6 @@ $L$oop_mul_gather:: adc r8,0 mul rbx - movd xmm4,DWORD PTR[rbp] add r9,rax mov rax,QWORD PTR[16+rsi] adc rdx,0 @@ -897,7 +979,6 @@ $L$oop_mul_gather:: adc r9,0 mul rbx - movd xmm5,DWORD PTR[64+rbp] add r10,rax mov rax,QWORD PTR[24+rsi] adc rdx,0 @@ -906,7 +987,6 @@ $L$oop_mul_gather:: adc r10,0 mul rbx - pslldq xmm5,4 add r11,rax mov rax,QWORD PTR[32+rsi] adc rdx,0 @@ -915,7 +995,6 @@ $L$oop_mul_gather:: adc r11,0 mul rbx - por xmm4,xmm5 add r12,rax mov rax,QWORD PTR[40+rsi] adc rdx,0 @@ -940,7 +1019,6 @@ $L$oop_mul_gather:: adc r14,0 mul rbx -DB 102,72,15,126,227 add r15,rax mov rax,QWORD PTR[rsi] adc rdx,0 @@ -948,7 +1026,6 @@ DB 102,72,15,126,227 mov r15,rdx adc r15,0 - lea rbp,QWORD PTR[128+rbp] lea rdi,QWORD PTR[8+rdi] dec ecx @@ -963,8 +1040,8 @@ DB 102,72,15,126,227 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 -DB 102,72,15,126,199 -DB 102,72,15,126,205 + mov rdi,QWORD PTR[((128+8))+rsp] + mov rbp,QWORD PTR[((128+16))+rsp] mov r8,QWORD PTR[rsp] mov r9,QWORD PTR[8+rsp] @@ -980,45 +1057,37 @@ DB 102,72,15,126,205 ALIGN 32 $L$mulx_gather:: - mov eax,DWORD PTR[64+r9*4+rdx] -DB 102,72,15,110,199 - lea rbp,QWORD PTR[128+r9*4+rdx] - mov edx,DWORD PTR[r9*4+rdx] -DB 102,72,15,110,201 +DB 102,76,15,126,194 + mov QWORD PTR[128+rsp],r8 + mov QWORD PTR[((128+8))+rsp],rdi + mov QWORD PTR[((128+16))+rsp],rcx - shl rax,32 - or rdx,rax mulx r8,rbx,QWORD PTR[rsi] mov QWORD PTR[rsp],rbx xor edi,edi mulx r9,rax,QWORD PTR[8+rsi] - movd xmm4,DWORD PTR[rbp] mulx r10,rbx,QWORD PTR[16+rsi] - movd xmm5,DWORD PTR[64+rbp] adcx r8,rax mulx r11,rax,QWORD PTR[24+rsi] - pslldq xmm5,4 adcx r9,rbx mulx r12,rbx,QWORD PTR[32+rsi] - por xmm4,xmm5 adcx r10,rax mulx r13,rax,QWORD PTR[40+rsi] adcx r11,rbx mulx r14,rbx,QWORD PTR[48+rsi] - lea rbp,QWORD PTR[128+rbp] adcx r12,rax mulx r15,rax,QWORD PTR[56+rsi] -DB 102,72,15,126,226 adcx r13,rbx adcx r14,rax +DB 067h mov rbx,r8 adcx r15,rdi @@ -1027,24 +1096,48 @@ DB 102,72,15,126,226 ALIGN 32 $L$oop_mulx_gather:: - mulx r8,rax,QWORD PTR[rsi] + movdqa xmm8,XMMWORD PTR[rbp] + movdqa xmm9,XMMWORD PTR[16+rbp] + movdqa xmm10,XMMWORD PTR[32+rbp] + movdqa xmm11,XMMWORD PTR[48+rbp] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD PTR[64+rbp] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD PTR[80+rbp] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD PTR[96+rbp] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD PTR[112+rbp] + lea rbp,QWORD PTR[128+rbp] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,04eh + por xmm8,xmm9 +DB 102,76,15,126,194 + +DB 0c4h,062h,0fbh,0f6h,086h,000h,000h,000h,000h adcx rbx,rax adox r8,r9 mulx r9,rax,QWORD PTR[8+rsi] -DB 066h,00fh,06eh,0a5h,000h,000h,000h,000h adcx r8,rax adox r9,r10 mulx r10,rax,QWORD PTR[16+rsi] - movd xmm5,DWORD PTR[64+rbp] - lea rbp,QWORD PTR[128+rbp] adcx r9,rax adox r10,r11 DB 0c4h,062h,0fbh,0f6h,09eh,018h,000h,000h,000h - pslldq xmm5,4 - por xmm4,xmm5 adcx r10,rax adox r11,r12 @@ -1058,10 +1151,10 @@ DB 0c4h,062h,0fbh,0f6h,09eh,018h,000h,000h,000h DB 0c4h,062h,0fbh,0f6h,0b6h,030h,000h,000h,000h adcx r13,rax +DB 067h adox r14,r15 mulx r15,rax,QWORD PTR[56+rsi] -DB 102,72,15,126,226 mov QWORD PTR[64+rcx*8+rsp],rbx adcx r14,rax adox r15,rdi @@ -1080,10 +1173,10 @@ DB 102,72,15,126,226 mov QWORD PTR[((64+48))+rsp],r14 mov QWORD PTR[((64+56))+rsp],r15 -DB 102,72,15,126,199 -DB 
102,72,15,126,205 - mov rdx,QWORD PTR[128+rsp] + mov rdi,QWORD PTR[((128+8))+rsp] + mov rbp,QWORD PTR[((128+16))+rsp] + mov r8,QWORD PTR[rsp] mov r9,QWORD PTR[8+rsp] mov r10,QWORD PTR[16+rsp] @@ -1109,6 +1202,17 @@ $L$mul_gather_tail:: call __rsaz_512_subtract lea rax,QWORD PTR[((128+24+48))+rsp] + movaps xmm6,XMMWORD PTR[((160-200))+rax] + movaps xmm7,XMMWORD PTR[((176-200))+rax] + movaps xmm8,XMMWORD PTR[((192-200))+rax] + movaps xmm9,XMMWORD PTR[((208-200))+rax] + movaps xmm10,XMMWORD PTR[((224-200))+rax] + movaps xmm11,XMMWORD PTR[((240-200))+rax] + movaps xmm12,XMMWORD PTR[((256-200))+rax] + movaps xmm13,XMMWORD PTR[((272-200))+rax] + movaps xmm14,XMMWORD PTR[((288-200))+rax] + movaps xmm15,XMMWORD PTR[((304-200))+rax] + lea rax,QWORD PTR[176+rax] mov r15,QWORD PTR[((-48))+rax] mov r14,QWORD PTR[((-40))+rax] mov r13,QWORD PTR[((-32))+rax] @@ -1148,7 +1252,7 @@ $L$SEH_begin_rsaz_512_mul_scatter4:: mov r9d,r9d sub rsp,128+24 $L$mul_scatter4_body:: - lea r8,QWORD PTR[r9*4+r8] + lea r8,QWORD PTR[r9*8+r8] DB 102,72,15,110,199 DB 102,72,15,110,202 DB 102,73,15,110,208 @@ -1211,30 +1315,14 @@ DB 102,72,15,126,214 call __rsaz_512_subtract - mov DWORD PTR[rsi],r8d - shr r8,32 - mov DWORD PTR[128+rsi],r9d - shr r9,32 - mov DWORD PTR[256+rsi],r10d - shr r10,32 - mov DWORD PTR[384+rsi],r11d - shr r11,32 - mov DWORD PTR[512+rsi],r12d - shr r12,32 - mov DWORD PTR[640+rsi],r13d - shr r13,32 - mov DWORD PTR[768+rsi],r14d - shr r14,32 - mov DWORD PTR[896+rsi],r15d - shr r15,32 - mov DWORD PTR[64+rsi],r8d - mov DWORD PTR[192+rsi],r9d - mov DWORD PTR[320+rsi],r10d - mov DWORD PTR[448+rsi],r11d - mov DWORD PTR[576+rsi],r12d - mov DWORD PTR[704+rsi],r13d - mov DWORD PTR[832+rsi],r14d - mov DWORD PTR[960+rsi],r15d + mov QWORD PTR[rsi],r8 + mov QWORD PTR[128+rsi],r9 + mov QWORD PTR[256+rsi],r10 + mov QWORD PTR[384+rsi],r11 + mov QWORD PTR[512+rsi],r12 + mov QWORD PTR[640+rsi],r13 + mov QWORD PTR[768+rsi],r14 + mov QWORD PTR[896+rsi],r15 lea rax,QWORD PTR[((128+24+48))+rsp] mov r15,QWORD PTR[((-48))+rax] @@ -1789,16 +1877,14 @@ PUBLIC rsaz_512_scatter4 ALIGN 16 rsaz_512_scatter4 PROC PUBLIC - lea rcx,QWORD PTR[r8*4+rcx] + lea rcx,QWORD PTR[r8*8+rcx] mov r9d,8 jmp $L$oop_scatter ALIGN 16 $L$oop_scatter:: mov rax,QWORD PTR[rdx] lea rdx,QWORD PTR[8+rdx] - mov DWORD PTR[rcx],eax - shr rax,32 - mov DWORD PTR[64+rcx],eax + mov QWORD PTR[rcx],rax lea rcx,QWORD PTR[128+rcx] dec r9d jnz $L$oop_scatter @@ -1809,22 +1895,98 @@ PUBLIC rsaz_512_gather4 ALIGN 16 rsaz_512_gather4 PROC PUBLIC - lea rdx,QWORD PTR[r8*4+rdx] +$L$SEH_begin_rsaz_512_gather4:: +DB 048h,081h,0ech,0a8h,000h,000h,000h +DB 00fh,029h,034h,024h +DB 00fh,029h,07ch,024h,010h +DB 044h,00fh,029h,044h,024h,020h +DB 044h,00fh,029h,04ch,024h,030h +DB 044h,00fh,029h,054h,024h,040h +DB 044h,00fh,029h,05ch,024h,050h +DB 044h,00fh,029h,064h,024h,060h +DB 044h,00fh,029h,06ch,024h,070h +DB 044h,00fh,029h,0b4h,024h,080h,0,0,0 +DB 044h,00fh,029h,0bch,024h,090h,0,0,0 + movd xmm8,r8d + movdqa xmm1,XMMWORD PTR[(($L$inc+16))] + movdqa xmm0,XMMWORD PTR[$L$inc] + + pshufd xmm8,xmm8,0 + movdqa xmm7,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm8 + movdqa xmm3,xmm7 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm8 + movdqa xmm4,xmm7 + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm8 + movdqa xmm5,xmm7 + paddd xmm4,xmm3 + pcmpeqd xmm3,xmm8 + movdqa xmm6,xmm7 + paddd xmm5,xmm4 + pcmpeqd xmm4,xmm8 + paddd xmm6,xmm5 + pcmpeqd xmm5,xmm8 + paddd xmm7,xmm6 + pcmpeqd xmm6,xmm8 + pcmpeqd xmm7,xmm8 mov r9d,8 jmp $L$oop_gather ALIGN 16 $L$oop_gather:: - mov eax,DWORD PTR[rdx] - mov r8d,DWORD 
PTR[64+rdx] + movdqa xmm8,XMMWORD PTR[rdx] + movdqa xmm9,XMMWORD PTR[16+rdx] + movdqa xmm10,XMMWORD PTR[32+rdx] + movdqa xmm11,XMMWORD PTR[48+rdx] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD PTR[64+rdx] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD PTR[80+rdx] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD PTR[96+rdx] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD PTR[112+rdx] lea rdx,QWORD PTR[128+rdx] - shl r8,32 - or rax,r8 - mov QWORD PTR[rcx],rax + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,04eh + por xmm8,xmm9 + movq QWORD PTR[rcx],xmm8 lea rcx,QWORD PTR[8+rcx] dec r9d jnz $L$oop_gather + movaps xmm6,XMMWORD PTR[rsp] + movaps xmm7,XMMWORD PTR[16+rsp] + movaps xmm8,XMMWORD PTR[32+rsp] + movaps xmm9,XMMWORD PTR[48+rsp] + movaps xmm10,XMMWORD PTR[64+rsp] + movaps xmm11,XMMWORD PTR[80+rsp] + movaps xmm12,XMMWORD PTR[96+rsp] + movaps xmm13,XMMWORD PTR[112+rsp] + movaps xmm14,XMMWORD PTR[128+rsp] + movaps xmm15,XMMWORD PTR[144+rsp] + add rsp,0a8h DB 0F3h,0C3h ;repret +$L$SEH_end_rsaz_512_gather4:: rsaz_512_gather4 ENDP + +ALIGN 64 +$L$inc:: + DD 0,0,1,1 + DD 2,2,2,2 EXTERN __imp_RtlVirtualUnwind:NEAR ALIGN 16 @@ -1860,6 +2022,18 @@ se_handler PROC PRIVATE lea rax,QWORD PTR[((128+24+48))+rax] + lea rbx,QWORD PTR[$L$mul_gather4_epilogue] + cmp rbx,r10 + jne $L$se_not_in_mul_gather4 + + lea rax,QWORD PTR[176+rax] + + lea rsi,QWORD PTR[((-48-168))+rax] + lea rdi,QWORD PTR[512+r8] + mov ecx,20 + DD 0a548f3fch + +$L$se_not_in_mul_gather4:: mov rbx,QWORD PTR[((-8))+rax] mov rbp,QWORD PTR[((-16))+rax] mov r12,QWORD PTR[((-24))+rax] @@ -1936,6 +2110,10 @@ ALIGN 4 DD imagerel $L$SEH_end_rsaz_512_mul_by_one DD imagerel $L$SEH_info_rsaz_512_mul_by_one + DD imagerel $L$SEH_begin_rsaz_512_gather4 + DD imagerel $L$SEH_end_rsaz_512_gather4 + DD imagerel $L$SEH_info_rsaz_512_gather4 + .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 @@ -1959,6 +2137,19 @@ $L$SEH_info_rsaz_512_mul_by_one:: DB 9,0,0,0 DD imagerel se_handler DD imagerel $L$mul_by_one_body,imagerel $L$mul_by_one_epilogue +$L$SEH_info_rsaz_512_gather4:: +DB 001h,046h,016h,000h +DB 046h,0f8h,009h,000h +DB 03dh,0e8h,008h,000h +DB 034h,0d8h,007h,000h +DB 02eh,0c8h,006h,000h +DB 028h,0b8h,005h,000h +DB 022h,0a8h,004h,000h +DB 01ch,098h,003h,000h +DB 016h,088h,002h,000h +DB 010h,078h,001h,000h +DB 00bh,068h,000h,000h +DB 007h,001h,015h,000h .xdata ENDS END diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm index f2527450608007..e70ec9f31ab574 100644 --- a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm +++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont.asm @@ -681,20 +681,20 @@ $L$sqr8x_enter:: - lea r11,QWORD PTR[((-64))+r9*4+rsp] + lea r11,QWORD PTR[((-64))+r9*2+rsp] mov r8,QWORD PTR[r8] sub r11,rsi and r11,4095 cmp r10,r11 jb $L$sqr8x_sp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*4+rsp] + lea rsp,QWORD PTR[((-64))+r9*2+rsp] jmp $L$sqr8x_sp_done ALIGN 32 $L$sqr8x_sp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*4] - lea rsp,QWORD PTR[((-64))+r9*4+rsp] + lea r10,QWORD PTR[((4096-64))+r9*2] + lea rsp,QWORD PTR[((-64))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -704,73 +704,99 @@ $L$sqr8x_sp_done:: mov r10,r9 neg r9 - lea r11,QWORD PTR[64+r9*2+rsp] mov QWORD PTR[32+rsp],r8 mov QWORD PTR[40+rsp],rax $L$sqr8x_body:: - mov rbp,r9 -DB 102,73,15,110,211 - shr rbp,3+2 - mov eax,DWORD PTR[((OPENSSL_ia32cap_P+8))] - jmp $L$sqr8x_copy_n - 
-ALIGN 32 -$L$sqr8x_copy_n:: - movq xmm0,QWORD PTR[rcx] - movq xmm1,QWORD PTR[8+rcx] - movq xmm3,QWORD PTR[16+rcx] - movq xmm4,QWORD PTR[24+rcx] - lea rcx,QWORD PTR[32+rcx] - movdqa XMMWORD PTR[r11],xmm0 - movdqa XMMWORD PTR[16+r11],xmm1 - movdqa XMMWORD PTR[32+r11],xmm3 - movdqa XMMWORD PTR[48+r11],xmm4 - lea r11,QWORD PTR[64+r11] - dec rbp - jnz $L$sqr8x_copy_n - +DB 102,72,15,110,209 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,73,15,110,218 + mov eax,DWORD PTR[((OPENSSL_ia32cap_P+8))] and eax,080100h cmp eax,080100h jne $L$sqr8x_nox call bn_sqrx8x_internal - pxor xmm0,xmm0 - lea rax,QWORD PTR[48+rsp] - lea rdx,QWORD PTR[64+r9*2+rsp] - shr r9,3+2 - mov rsi,QWORD PTR[40+rsp] - jmp $L$sqr8x_zero + + + + lea rbx,QWORD PTR[rcx*1+r8] + mov r9,rcx + mov rdx,rcx +DB 102,72,15,126,207 + sar rcx,3+2 + jmp $L$sqr8x_sub ALIGN 32 $L$sqr8x_nox:: call bn_sqr8x_internal + + + + lea rbx,QWORD PTR[r9*1+rdi] + mov rcx,r9 + mov rdx,r9 +DB 102,72,15,126,207 + sar rcx,3+2 + jmp $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_sub:: + mov r12,QWORD PTR[rbx] + mov r13,QWORD PTR[8+rbx] + mov r14,QWORD PTR[16+rbx] + mov r15,QWORD PTR[24+rbx] + lea rbx,QWORD PTR[32+rbx] + sbb r12,QWORD PTR[rbp] + sbb r13,QWORD PTR[8+rbp] + sbb r14,QWORD PTR[16+rbp] + sbb r15,QWORD PTR[24+rbp] + lea rbp,QWORD PTR[32+rbp] + mov QWORD PTR[rdi],r12 + mov QWORD PTR[8+rdi],r13 + mov QWORD PTR[16+rdi],r14 + mov QWORD PTR[24+rdi],r15 + lea rdi,QWORD PTR[32+rdi] + inc rcx + jnz $L$sqr8x_sub + + sbb rax,0 + lea rbx,QWORD PTR[r9*1+rbx] + lea rdi,QWORD PTR[r9*1+rdi] + +DB 102,72,15,110,200 pxor xmm0,xmm0 - lea rax,QWORD PTR[48+rsp] - lea rdx,QWORD PTR[64+r9*2+rsp] - shr r9,3+2 + pshufd xmm1,xmm1,0 mov rsi,QWORD PTR[40+rsp] - jmp $L$sqr8x_zero + jmp $L$sqr8x_cond_copy ALIGN 32 -$L$sqr8x_zero:: - movdqa XMMWORD PTR[rax],xmm0 - movdqa XMMWORD PTR[16+rax],xmm0 - movdqa XMMWORD PTR[32+rax],xmm0 - movdqa XMMWORD PTR[48+rax],xmm0 - lea rax,QWORD PTR[64+rax] - movdqa XMMWORD PTR[rdx],xmm0 - movdqa XMMWORD PTR[16+rdx],xmm0 - movdqa XMMWORD PTR[32+rdx],xmm0 - movdqa XMMWORD PTR[48+rdx],xmm0 - lea rdx,QWORD PTR[64+rdx] - dec r9 - jnz $L$sqr8x_zero +$L$sqr8x_cond_copy:: + movdqa xmm2,XMMWORD PTR[rbx] + movdqa xmm3,XMMWORD PTR[16+rbx] + lea rbx,QWORD PTR[32+rbx] + movdqu xmm4,XMMWORD PTR[rdi] + movdqu xmm5,XMMWORD PTR[16+rdi] + lea rdi,QWORD PTR[32+rdi] + movdqa XMMWORD PTR[(-32)+rbx],xmm0 + movdqa XMMWORD PTR[(-16)+rbx],xmm0 + movdqa XMMWORD PTR[(-32)+rdx*1+rbx],xmm0 + movdqa XMMWORD PTR[(-16)+rdx*1+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD PTR[(-32)+rdi],xmm4 + movdqu XMMWORD PTR[(-16)+rdi],xmm5 + add r9,32 + jnz $L$sqr8x_cond_copy mov rax,1 mov r15,QWORD PTR[((-48))+rsi] @@ -1040,64 +1066,75 @@ $L$mulx4x_inner:: adc r15,rbp sub rbp,QWORD PTR[rbx] adc r14,r15 - mov r8,QWORD PTR[((-8))+rcx] sbb r15,r15 mov QWORD PTR[((-8))+rbx],r14 cmp rdi,QWORD PTR[16+rsp] jne $L$mulx4x_outer - sub r8,r14 - sbb r8,r8 - or r15,r8 - - neg rax - xor rdx,rdx + lea rbx,QWORD PTR[64+rsp] + sub rcx,rax + neg r15 + mov rdx,rax + shr rax,3+2 mov rdi,QWORD PTR[32+rsp] + jmp $L$mulx4x_sub + +ALIGN 32 +$L$mulx4x_sub:: + mov r11,QWORD PTR[rbx] + mov r12,QWORD PTR[8+rbx] + mov r13,QWORD PTR[16+rbx] + mov r14,QWORD PTR[24+rbx] + lea rbx,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[rcx] + sbb r12,QWORD PTR[8+rcx] + sbb r13,QWORD PTR[16+rcx] + sbb r14,QWORD PTR[24+rcx] + lea rcx,QWORD PTR[32+rcx] + mov QWORD PTR[rdi],r11 + mov QWORD PTR[8+rdi],r12 + mov QWORD PTR[16+rdi],r13 + mov QWORD 
PTR[24+rdi],r14 + lea rdi,QWORD PTR[32+rdi] + dec rax + jnz $L$mulx4x_sub + + sbb r15,0 lea rbx,QWORD PTR[64+rsp] + sub rdi,rdx +DB 102,73,15,110,207 pxor xmm0,xmm0 - mov r8,QWORD PTR[rax*1+rcx] - mov r9,QWORD PTR[8+rax*1+rcx] - neg r8 - jmp $L$mulx4x_sub_entry + pshufd xmm1,xmm1,0 + mov rsi,QWORD PTR[40+rsp] + jmp $L$mulx4x_cond_copy ALIGN 32 -$L$mulx4x_sub:: - mov r8,QWORD PTR[rax*1+rcx] - mov r9,QWORD PTR[8+rax*1+rcx] - not r8 -$L$mulx4x_sub_entry:: - mov r10,QWORD PTR[16+rax*1+rcx] - not r9 - and r8,r15 - mov r11,QWORD PTR[24+rax*1+rcx] - not r10 - and r9,r15 - not r11 - and r10,r15 - and r11,r15 - - neg rdx - adc r8,QWORD PTR[rbx] - adc r9,QWORD PTR[8+rbx] - movdqa XMMWORD PTR[rbx],xmm0 - adc r10,QWORD PTR[16+rbx] - adc r11,QWORD PTR[24+rbx] - movdqa XMMWORD PTR[16+rbx],xmm0 +$L$mulx4x_cond_copy:: + movdqa xmm2,XMMWORD PTR[rbx] + movdqa xmm3,XMMWORD PTR[16+rbx] lea rbx,QWORD PTR[32+rbx] - sbb rdx,rdx - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 + movdqu xmm4,XMMWORD PTR[rdi] + movdqu xmm5,XMMWORD PTR[16+rdi] lea rdi,QWORD PTR[32+rdi] + movdqa XMMWORD PTR[(-32)+rbx],xmm0 + movdqa XMMWORD PTR[(-16)+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD PTR[(-32)+rdi],xmm4 + movdqu XMMWORD PTR[(-16)+rdi],xmm5 + sub rdx,32 + jnz $L$mulx4x_cond_copy - add rax,32 - jnz $L$mulx4x_sub + mov QWORD PTR[rbx],rdx - mov rsi,QWORD PTR[40+rsp] mov rax,1 mov r15,QWORD PTR[((-48))+rsi] mov r14,QWORD PTR[((-40))+rsi] diff --git a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm index eae2339ef1378d..080fb167848f84 100644 --- a/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm +++ b/deps/openssl/asm/x64-win32-masm/bn/x86_64-mont5.asm @@ -28,49 +28,151 @@ ALIGN 16 $L$mul_enter:: mov r9d,r9d mov rax,rsp - mov r10d,DWORD PTR[56+rsp] + movd xmm5,DWORD PTR[56+rsp] + lea r10,QWORD PTR[$L$inc] push rbx push rbp push r12 push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 + lea r11,QWORD PTR[2+r9] neg r11 - lea rsp,QWORD PTR[r11*8+rsp] + lea rsp,QWORD PTR[((-264))+r11*8+rsp] and rsp,-1024 mov QWORD PTR[8+r9*8+rsp],rax $L$mul_body:: - mov r12,rdx - mov r11,r10 - shr r10,3 - and r11,7 - not r10 - lea rax,QWORD PTR[$L$magic_masks] - and r10,3 - lea r12,QWORD PTR[96+r11*8+r12] - movq xmm4,QWORD PTR[r10*8+rax] - movq xmm5,QWORD PTR[8+r10*8+rax] - movq xmm6,QWORD PTR[16+r10*8+rax] - movq xmm7,QWORD PTR[24+r10*8+rax] - - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 + lea r12,QWORD PTR[128+rdx] + movdqa xmm0,XMMWORD PTR[r10] + movdqa xmm1,XMMWORD PTR[16+r10] + lea r10,QWORD PTR[((24-112))+r9*8+rsp] + and r10,-16 + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 +DB 067h + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD 
PTR[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[304+r10],xmm0 + + paddd xmm3,xmm2 +DB 067h + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[336+r10],xmm2 + pand xmm0,XMMWORD PTR[64+r12] + + pand xmm1,XMMWORD PTR[80+r12] + pand xmm2,XMMWORD PTR[96+r12] + movdqa XMMWORD PTR[352+r10],xmm3 + pand xmm3,XMMWORD PTR[112+r12] por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-128))+r12] + movdqa xmm5,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + pand xmm4,XMMWORD PTR[112+r10] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm5,XMMWORD PTR[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-64))+r12] + movdqa xmm5,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + pand xmm4,XMMWORD PTR[176+r10] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm5,XMMWORD PTR[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[r12] + movdqa xmm5,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + pand xmm4,XMMWORD PTR[240+r10] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm5,XMMWORD PTR[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 - DB 102,72,15,126,195 mov r8,QWORD PTR[r8] @@ -79,29 +181,14 @@ DB 102,72,15,126,195 xor r14,r14 xor r15,r15 - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - mov rbp,r8 mul rbx mov r10,rax mov rax,QWORD PTR[rcx] - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - imul rbp,r10 mov r11,rdx - por xmm0,xmm2 - lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 - mul rbp add r10,rax mov rax,QWORD PTR[8+rsi] @@ -134,14 +221,12 @@ $L$1st_enter:: cmp r15,r9 jne $L$1st -DB 102,72,15,126,195 add r13,rax - mov rax,QWORD PTR[rsi] adc rdx,0 add r13,r11 adc rdx,0 - mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov QWORD PTR[((-16))+r9*8+rsp],r13 mov r13,rdx mov r11,r10 @@ -155,33 +240,78 @@ DB 102,72,15,126,195 jmp $L$outer ALIGN 16 $L$outer:: + lea rdx,QWORD PTR[((24+128))+r9*8+rsp] + and rdx,-16 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD PTR[((-128))+r12] + movdqa xmm1,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm0,XMMWORD PTR[((-128))+rdx] + pand xmm1,XMMWORD PTR[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa 
xmm0,XMMWORD PTR[((-64))+r12] + movdqa xmm1,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm0,XMMWORD PTR[((-64))+rdx] + pand xmm1,XMMWORD PTR[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[r12] + movdqa xmm1,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm0,XMMWORD PTR[rdx] + pand xmm1,XMMWORD PTR[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[64+r12] + movdqa xmm1,XMMWORD PTR[80+r12] + movdqa xmm2,XMMWORD PTR[96+r12] + movdqa xmm3,XMMWORD PTR[112+r12] + pand xmm0,XMMWORD PTR[64+rdx] + pand xmm1,XMMWORD PTR[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,04eh + por xmm0,xmm4 + lea r12,QWORD PTR[256+r12] + + mov rax,QWORD PTR[rsi] +DB 102,72,15,126,195 + xor r15,r15 mov rbp,r8 mov r10,QWORD PTR[rsp] - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - mul rbx add r10,rax mov rax,QWORD PTR[rcx] adc rdx,0 - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - imul rbp,r10 mov r11,rdx - por xmm0,xmm2 - lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 - mul rbp add r10,rax mov rax,QWORD PTR[8+rsi] @@ -217,15 +347,12 @@ $L$inner_enter:: cmp r15,r9 jne $L$inner -DB 102,72,15,126,195 - add r13,rax - mov rax,QWORD PTR[rsi] adc rdx,0 add r13,r10 - mov r10,QWORD PTR[r15*8+rsp] + mov r10,QWORD PTR[r9*8+rsp] adc rdx,0 - mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov QWORD PTR[((-16))+r9*8+rsp],r13 mov r13,rdx xor rdx,rdx @@ -272,8 +399,7 @@ $L$copy:: mov rsi,QWORD PTR[8+r9*8+rsp] mov rax,1 - movaps xmm6,XMMWORD PTR[((-88))+rsi] - movaps xmm7,XMMWORD PTR[((-72))+rsi] + mov r15,QWORD PTR[((-48))+rsi] mov r14,QWORD PTR[((-40))+rsi] mov r13,QWORD PTR[((-32))+rsi] @@ -303,8 +429,8 @@ $L$SEH_begin_bn_mul4x_mont_gather5:: $L$mul4x_enter:: - and r11d,080100h - cmp r11d,080100h + and r11d,080108h + cmp r11d,080108h je $L$mulx4x_enter DB 067h mov rax,rsp @@ -314,13 +440,10 @@ DB 067h push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 + DB 067h - mov r10d,r9d shl r9d,3 - shl r10d,3+2 + lea r10,QWORD PTR[r9*2+r9] neg r9 @@ -330,19 +453,21 @@ DB 067h - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$mul4xsp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$mul4xsp_done ALIGN 32 $L$mul4xsp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -358,8 +483,7 @@ $L$mul4x_body:: mov rsi,QWORD PTR[40+rsp] mov rax,1 - movaps xmm6,XMMWORD PTR[((-88))+rsi] - movaps xmm7,XMMWORD PTR[((-72))+rsi] + mov r15,QWORD PTR[((-48))+rsi] mov r14,QWORD PTR[((-40))+rsi] mov r13,QWORD PTR[((-32))+rsi] @@ -378,47 +502,141 @@ bn_mul4x_mont_gather5 ENDP ALIGN 32 mul4x_internal PROC PRIVATE shl r9,5 - mov r10d,DWORD PTR[56+rax] - lea r13,QWORD PTR[256+r9*1+rdx] + movd xmm5,DWORD PTR[56+rax] + lea rax,QWORD 
PTR[$L$inc] + lea r13,QWORD PTR[128+r9*1+rdx] shr r9,5 - mov r11,r10 - shr r10,3 - and r11,7 - not r10 - lea rax,QWORD PTR[$L$magic_masks] - and r10,3 - lea r12,QWORD PTR[96+r11*8+rdx] - movq xmm4,QWORD PTR[r10*8+rax] - movq xmm5,QWORD PTR[8+r10*8+rax] - add r11,7 - movq xmm6,QWORD PTR[16+r10*8+rax] - movq xmm7,QWORD PTR[24+r10*8+rax] - and r11,7 - - movq xmm0,QWORD PTR[((-96))+r12] - lea r14,QWORD PTR[256+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 -DB 067h - por xmm0,xmm1 - movq xmm1,QWORD PTR[((-96))+r14] -DB 067h - pand xmm3,xmm7 -DB 067h - por xmm0,xmm2 - movq xmm2,QWORD PTR[((-32))+r14] + movdqa xmm0,XMMWORD PTR[rax] + movdqa xmm1,XMMWORD PTR[16+rax] + lea r10,QWORD PTR[((88-112))+r9*1+rsp] + lea r12,QWORD PTR[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 067h,067h + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 DB 067h - pand xmm1,xmm4 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[304+r10],xmm0 + + paddd xmm3,xmm2 DB 067h - por xmm0,xmm3 - movq xmm3,QWORD PTR[32+r14] + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[336+r10],xmm2 + pand xmm0,XMMWORD PTR[64+r12] + pand xmm1,XMMWORD PTR[80+r12] + pand xmm2,XMMWORD PTR[96+r12] + movdqa XMMWORD PTR[352+r10],xmm3 + pand xmm3,XMMWORD PTR[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-128))+r12] + movdqa xmm5,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + pand xmm4,XMMWORD PTR[112+r10] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm5,XMMWORD PTR[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-64))+r12] + movdqa xmm5,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + pand xmm4,XMMWORD PTR[176+r10] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm5,XMMWORD PTR[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[r12] + movdqa xmm5,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + pand xmm4,XMMWORD PTR[240+r10] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm5,XMMWORD PTR[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[272+r10] + por xmm1,xmm5 + pand 
xmm3,XMMWORD PTR[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 + lea r12,QWORD PTR[256+r12] DB 102,72,15,126,195 - movq xmm0,QWORD PTR[96+r14] + mov QWORD PTR[((16+8))+rsp],r13 mov QWORD PTR[((56+8))+rsp],rdi @@ -432,26 +650,10 @@ DB 102,72,15,126,195 mov r10,rax mov rax,QWORD PTR[rcx] - pand xmm2,xmm5 - pand xmm3,xmm6 - por xmm1,xmm2 - imul rbp,r10 - - - - - - - - lea r14,QWORD PTR[((64+8))+r11*8+rsp] + lea r14,QWORD PTR[((64+8))+rsp] mov r11,rdx - pand xmm0,xmm7 - por xmm1,xmm3 - lea r12,QWORD PTR[512+r12] - por xmm0,xmm1 - mul rbp add r10,rax mov rax,QWORD PTR[8+r9*1+rsi] @@ -460,7 +662,7 @@ DB 102,72,15,126,195 mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 mov r10,rdx @@ -470,7 +672,7 @@ DB 102,72,15,126,195 adc rdx,0 add rdi,r11 lea r15,QWORD PTR[32+r9] - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] adc rdx,0 mov QWORD PTR[r14],rdi mov r13,rdx @@ -480,7 +682,7 @@ ALIGN 32 $L$1st4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] lea r14,QWORD PTR[32+r14] adc rdx,0 mov r11,rdx @@ -496,7 +698,7 @@ $L$1st4x:: mul rbx add r11,rax - mov rax,QWORD PTR[((-16))+rcx] + mov rax,QWORD PTR[((-8))+rcx] adc rdx,0 mov r10,rdx @@ -526,7 +728,7 @@ $L$1st4x:: mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 mov r10,rdx @@ -535,7 +737,7 @@ $L$1st4x:: mov rax,QWORD PTR[16+r15*1+rsi] adc rdx,0 add rdi,r11 - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] adc rdx,0 mov QWORD PTR[r14],rdi mov r13,rdx @@ -545,7 +747,7 @@ $L$1st4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] lea r14,QWORD PTR[32+r14] adc rdx,0 mov r11,rdx @@ -561,7 +763,7 @@ $L$1st4x:: mul rbx add r11,rax - mov rax,QWORD PTR[((-16))+rcx] + mov rax,QWORD PTR[((-8))+rcx] adc rdx,0 mov r10,rdx @@ -574,8 +776,7 @@ $L$1st4x:: mov QWORD PTR[((-16))+r14],rdi mov r13,rdx -DB 102,72,15,126,195 - lea rcx,QWORD PTR[r9*2+rcx] + lea rcx,QWORD PTR[r9*1+rcx] xor rdi,rdi add r13,r10 @@ -586,6 +787,63 @@ DB 102,72,15,126,195 ALIGN 32 $L$outer4x:: + lea rdx,QWORD PTR[((16+128))+r14] + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD PTR[((-128))+r12] + movdqa xmm1,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm0,XMMWORD PTR[((-128))+rdx] + pand xmm1,XMMWORD PTR[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[((-64))+r12] + movdqa xmm1,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm0,XMMWORD PTR[((-64))+rdx] + pand xmm1,XMMWORD PTR[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[r12] + movdqa xmm1,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm0,XMMWORD PTR[rdx] + pand xmm1,XMMWORD PTR[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[64+r12] + movdqa xmm1,XMMWORD PTR[80+r12] + movdqa xmm2,XMMWORD PTR[96+r12] + movdqa xmm3,XMMWORD PTR[112+r12] + pand xmm0,XMMWORD PTR[64+rdx] + pand xmm1,XMMWORD PTR[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD 
PTR[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,04eh + por xmm0,xmm4 + lea r12,QWORD PTR[256+r12] +DB 102,72,15,126,195 + mov r10,QWORD PTR[r9*1+r14] mov rbp,r8 mul rbx @@ -593,25 +851,11 @@ $L$outer4x:: mov rax,QWORD PTR[rcx] adc rdx,0 - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+r12] - imul rbp,r10 -DB 067h mov r11,rdx mov QWORD PTR[r14],rdi - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - por xmm0,xmm2 lea r14,QWORD PTR[r9*1+r14] - lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 mul rbp add r10,rax @@ -621,7 +865,7 @@ DB 067h mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 add r11,QWORD PTR[8+r14] adc rdx,0 @@ -633,7 +877,7 @@ DB 067h adc rdx,0 add rdi,r11 lea r15,QWORD PTR[32+r9] - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] adc rdx,0 mov r13,rdx jmp $L$inner4x @@ -642,7 +886,7 @@ ALIGN 32 $L$inner4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] adc rdx,0 add r10,QWORD PTR[16+r14] lea r14,QWORD PTR[32+r14] @@ -660,7 +904,7 @@ $L$inner4x:: mul rbx add r11,rax - mov rax,QWORD PTR[((-16))+rcx] + mov rax,QWORD PTR[((-8))+rcx] adc rdx,0 add r11,QWORD PTR[((-8))+r14] adc rdx,0 @@ -694,7 +938,7 @@ $L$inner4x:: mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 add r11,QWORD PTR[8+r14] adc rdx,0 @@ -705,7 +949,7 @@ $L$inner4x:: mov rax,QWORD PTR[16+r15*1+rsi] adc rdx,0 add rdi,r11 - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] adc rdx,0 mov QWORD PTR[((-8))+r14],r13 mov r13,rdx @@ -715,7 +959,7 @@ $L$inner4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] adc rdx,0 add r10,QWORD PTR[16+r14] lea r14,QWORD PTR[32+r14] @@ -734,7 +978,7 @@ $L$inner4x:: mul rbx add r11,rax mov rax,rbp - mov rbp,QWORD PTR[((-16))+rcx] + mov rbp,QWORD PTR[((-8))+rcx] adc rdx,0 add r11,QWORD PTR[((-8))+r14] adc rdx,0 @@ -749,9 +993,8 @@ $L$inner4x:: mov QWORD PTR[((-24))+r14],r13 mov r13,rdx -DB 102,72,15,126,195 mov QWORD PTR[((-16))+r14],rdi - lea rcx,QWORD PTR[r9*2+rcx] + lea rcx,QWORD PTR[r9*1+rcx] xor rdi,rdi add r13,r10 @@ -762,16 +1005,23 @@ DB 102,72,15,126,195 cmp r12,QWORD PTR[((16+8))+rsp] jb $L$outer4x + xor rax,rax sub rbp,r13 adc r15,r15 or rdi,r15 - xor rdi,1 + sub rax,rdi lea rbx,QWORD PTR[r9*1+r14] - lea rbp,QWORD PTR[rdi*8+rcx] + mov r12,QWORD PTR[rcx] + lea rbp,QWORD PTR[rcx] mov rcx,r9 sar rcx,3+2 mov rdi,QWORD PTR[((56+8))+rsp] - jmp $L$sqr4x_sub + dec r12 + xor r10,r10 + mov r13,QWORD PTR[8+rbp] + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] + jmp $L$sqr4x_sub_entry mul4x_internal ENDP PUBLIC bn_power5 @@ -790,8 +1040,8 @@ $L$SEH_begin_bn_power5:: mov r11d,DWORD PTR[((OPENSSL_ia32cap_P+8))] - and r11d,080100h - cmp r11d,080100h + and r11d,080108h + cmp r11d,080108h je $L$powerx5_enter mov rax,rsp push rbx @@ -800,12 +1050,9 @@ $L$SEH_begin_bn_power5:: push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10d,DWORD PTR[r9*2+r9] neg r9 mov r8,QWORD PTR[r8] @@ -815,19 +1062,20 @@ $L$SEH_begin_bn_power5:: - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$pwr_sp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$pwr_sp_done ALIGN 
32 $L$pwr_sp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -855,10 +1103,15 @@ DB 102,73,15,110,218 DB 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal DB 102,72,15,126,209 DB 102,72,15,126,226 @@ -1405,9 +1658,9 @@ DB 067h mov QWORD PTR[((-16))+rdi],rbx mov QWORD PTR[((-8))+rdi],r8 DB 102,72,15,126,213 -sqr8x_reduction:: +__bn_sqr8x_reduction:: xor rax,rax - lea rcx,QWORD PTR[r9*2+rbp] + lea rcx,QWORD PTR[rbp*1+r9] lea rdx,QWORD PTR[((48+8))+r9*2+rsp] mov QWORD PTR[((0+8))+rsp],rcx lea rdi,QWORD PTR[((48+8))+r9*1+rsp] @@ -1440,14 +1693,14 @@ DB 067h ALIGN 32 $L$8x_reduce:: mul rbx - mov rax,QWORD PTR[16+rbp] + mov rax,QWORD PTR[8+rbp] neg r8 mov r8,rdx adc r8,0 mul rbx add r9,rax - mov rax,QWORD PTR[32+rbp] + mov rax,QWORD PTR[16+rbp] adc rdx,0 add r8,r9 mov QWORD PTR[((48-8+8))+rcx*8+rsp],rbx @@ -1456,7 +1709,7 @@ $L$8x_reduce:: mul rbx add r10,rax - mov rax,QWORD PTR[48+rbp] + mov rax,QWORD PTR[24+rbp] adc rdx,0 add r9,r10 mov rsi,QWORD PTR[((32+8))+rsp] @@ -1465,7 +1718,7 @@ $L$8x_reduce:: mul rbx add r11,rax - mov rax,QWORD PTR[64+rbp] + mov rax,QWORD PTR[32+rbp] adc rdx,0 imul rsi,r8 add r10,r11 @@ -1474,7 +1727,7 @@ $L$8x_reduce:: mul rbx add r12,rax - mov rax,QWORD PTR[80+rbp] + mov rax,QWORD PTR[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx @@ -1482,7 +1735,7 @@ $L$8x_reduce:: mul rbx add r13,rax - mov rax,QWORD PTR[96+rbp] + mov rax,QWORD PTR[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx @@ -1490,7 +1743,7 @@ $L$8x_reduce:: mul rbx add r14,rax - mov rax,QWORD PTR[112+rbp] + mov rax,QWORD PTR[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx @@ -1508,7 +1761,7 @@ $L$8x_reduce:: dec ecx jnz $L$8x_reduce - lea rbp,QWORD PTR[128+rbp] + lea rbp,QWORD PTR[64+rbp] xor rax,rax mov rdx,QWORD PTR[((8+8))+rsp] cmp rbp,QWORD PTR[((0+8))+rsp] @@ -1534,14 +1787,14 @@ ALIGN 32 $L$8x_tail:: mul rbx add r8,rax - mov rax,QWORD PTR[16+rbp] + mov rax,QWORD PTR[8+rbp] mov QWORD PTR[rdi],r8 mov r8,rdx adc r8,0 mul rbx add r9,rax - mov rax,QWORD PTR[32+rbp] + mov rax,QWORD PTR[16+rbp] adc rdx,0 add r8,r9 lea rdi,QWORD PTR[8+rdi] @@ -1550,7 +1803,7 @@ $L$8x_tail:: mul rbx add r10,rax - mov rax,QWORD PTR[48+rbp] + mov rax,QWORD PTR[24+rbp] adc rdx,0 add r9,r10 mov r10,rdx @@ -1558,7 +1811,7 @@ $L$8x_tail:: mul rbx add r11,rax - mov rax,QWORD PTR[64+rbp] + mov rax,QWORD PTR[32+rbp] adc rdx,0 add r10,r11 mov r11,rdx @@ -1566,7 +1819,7 @@ $L$8x_tail:: mul rbx add r12,rax - mov rax,QWORD PTR[80+rbp] + mov rax,QWORD PTR[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx @@ -1574,7 +1827,7 @@ $L$8x_tail:: mul rbx add r13,rax - mov rax,QWORD PTR[96+rbp] + mov rax,QWORD PTR[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx @@ -1582,7 +1835,7 @@ $L$8x_tail:: mul rbx add r14,rax - mov rax,QWORD PTR[112+rbp] + mov rax,QWORD PTR[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx @@ -1600,7 +1853,7 @@ $L$8x_tail:: dec ecx jnz $L$8x_tail - lea rbp,QWORD PTR[128+rbp] + lea rbp,QWORD PTR[64+rbp] mov rdx,QWORD PTR[((8+8))+rsp] cmp rbp,QWORD PTR[((0+8))+rsp] jae $L$8x_tail_done @@ -1646,7 +1899,7 @@ $L$8x_no_tail:: adc r14,QWORD PTR[48+rdi] adc r15,QWORD PTR[56+rdi] adc rax,0 - mov rcx,QWORD PTR[((-16))+rbp] + mov rcx,QWORD PTR[((-8))+rbp] xor rsi,rsi DB 102,72,15,126,213 @@ -1664,44 
+1917,62 @@ DB 102,73,15,126,217 cmp rdi,rdx jb $L$8x_reduction_loop + DB 0F3h,0C3h ;repret +bn_sqr8x_internal ENDP - sub rcx,r15 +ALIGN 32 +__bn_post4x_internal PROC PRIVATE + mov r12,QWORD PTR[rbp] lea rbx,QWORD PTR[r9*1+rdi] - adc rsi,rsi mov rcx,r9 - or rax,rsi DB 102,72,15,126,207 - xor rax,1 + neg rax DB 102,72,15,126,206 - lea rbp,QWORD PTR[rax*8+rbp] sar rcx,3+2 - jmp $L$sqr4x_sub + dec r12 + xor r10,r10 + mov r13,QWORD PTR[8+rbp] + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] + jmp $L$sqr4x_sub_entry -ALIGN 32 +ALIGN 16 $L$sqr4x_sub:: -DB 066h - mov r12,QWORD PTR[rbx] - mov r13,QWORD PTR[8+rbx] - sbb r12,QWORD PTR[rbp] - mov r14,QWORD PTR[16+rbx] - sbb r13,QWORD PTR[16+rbp] - mov r15,QWORD PTR[24+rbx] - lea rbx,QWORD PTR[32+rbx] - sbb r14,QWORD PTR[32+rbp] + mov r12,QWORD PTR[rbp] + mov r13,QWORD PTR[8+rbp] + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] +$L$sqr4x_sub_entry:: + lea rbp,QWORD PTR[32+rbp] + not r12 + not r13 + not r14 + not r15 + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + neg r10 + adc r12,QWORD PTR[rbx] + adc r13,QWORD PTR[8+rbx] + adc r14,QWORD PTR[16+rbx] + adc r15,QWORD PTR[24+rbx] mov QWORD PTR[rdi],r12 - sbb r15,QWORD PTR[48+rbp] - lea rbp,QWORD PTR[64+rbp] + lea rbx,QWORD PTR[32+rbx] mov QWORD PTR[8+rdi],r13 + sbb r10,r10 mov QWORD PTR[16+rdi],r14 mov QWORD PTR[24+rdi],r15 lea rdi,QWORD PTR[32+rdi] inc rcx jnz $L$sqr4x_sub + mov r10,r9 neg r9 DB 0F3h,0C3h ;repret -bn_sqr8x_internal ENDP +__bn_post4x_internal ENDP PUBLIC bn_from_montgomery ALIGN 32 @@ -1735,13 +2006,9 @@ DB 067h push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 -DB 067h - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10,QWORD PTR[r9*2+r9] neg r9 mov r8,QWORD PTR[r8] @@ -1751,19 +2018,20 @@ DB 067h - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$from_sp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$from_sp_done ALIGN 32 $L$from_sp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -1815,12 +2083,13 @@ DB 067h mov rbp,rcx DB 102,73,15,110,218 mov r11d,DWORD PTR[((OPENSSL_ia32cap_P+8))] - and r11d,080100h - cmp r11d,080100h + and r11d,080108h + cmp r11d,080108h jne $L$from_mont_nox lea rdi,QWORD PTR[r9*1+rax] - call sqrx8x_reduction + call __bn_sqrx8x_reduction + call __bn_postx4x_internal pxor xmm0,xmm0 lea rax,QWORD PTR[48+rsp] @@ -1829,7 +2098,8 @@ DB 102,73,15,110,218 ALIGN 32 $L$from_mont_nox:: - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor xmm0,xmm0 lea rax,QWORD PTR[48+rsp] @@ -1876,7 +2146,6 @@ $L$SEH_begin_bn_mulx4x_mont_gather5:: $L$mulx4x_enter:: -DB 067h mov rax,rsp push rbx push rbp @@ -1884,13 +2153,9 @@ DB 067h push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 -DB 067h - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10,QWORD PTR[r9*2+r9] neg r9 mov r8,QWORD PTR[r8] @@ -1901,19 +2166,20 @@ DB 067h - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$mulx4xsp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$mulx4xsp_done -ALIGN 32 
$L$mulx4xsp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -1939,8 +2205,7 @@ $L$mulx4x_body:: mov rsi,QWORD PTR[40+rsp] mov rax,1 - movaps xmm6,XMMWORD PTR[((-88))+rsi] - movaps xmm7,XMMWORD PTR[((-72))+rsi] + mov r15,QWORD PTR[((-48))+rsi] mov r14,QWORD PTR[((-40))+rsi] mov r13,QWORD PTR[((-32))+rsi] @@ -1958,63 +2223,150 @@ bn_mulx4x_mont_gather5 ENDP ALIGN 32 mulx4x_internal PROC PRIVATE -DB 04ch,089h,08ch,024h,008h,000h,000h,000h -DB 067h + mov QWORD PTR[8+rsp],r9 + mov r10,r9 neg r9 shl r9,5 - lea r13,QWORD PTR[256+r9*1+rdx] + neg r10 + lea r13,QWORD PTR[128+r9*1+rdx] shr r9,5+5 - mov r10d,DWORD PTR[56+rax] + movd xmm5,DWORD PTR[56+rax] sub r9,1 + lea rax,QWORD PTR[$L$inc] mov QWORD PTR[((16+8))+rsp],r13 mov QWORD PTR[((24+8))+rsp],r9 mov QWORD PTR[((56+8))+rsp],rdi - mov r11,r10 - shr r10,3 - and r11,7 - not r10 - lea rax,QWORD PTR[$L$magic_masks] - and r10,3 - lea rdi,QWORD PTR[96+r11*8+rdx] - movq xmm4,QWORD PTR[r10*8+rax] - movq xmm5,QWORD PTR[8+r10*8+rax] - add r11,7 - movq xmm6,QWORD PTR[16+r10*8+rax] - movq xmm7,QWORD PTR[24+r10*8+rax] - and r11,7 - - movq xmm0,QWORD PTR[((-96))+rdi] - lea rbx,QWORD PTR[256+rdi] - movq xmm1,QWORD PTR[((-32))+rdi] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+rdi] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+rdi] - pand xmm2,xmm6 - por xmm0,xmm1 - movq xmm1,QWORD PTR[((-96))+rbx] - pand xmm3,xmm7 - por xmm0,xmm2 - movq xmm2,QWORD PTR[((-32))+rbx] - por xmm0,xmm3 -DB 067h,067h - pand xmm1,xmm4 - movq xmm3,QWORD PTR[32+rbx] + movdqa xmm0,XMMWORD PTR[rax] + movdqa xmm1,XMMWORD PTR[16+rax] + lea r10,QWORD PTR[((88-112))+r10*1+rsp] + lea rdi,QWORD PTR[128+rdx] + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 067h + movdqa xmm2,xmm1 +DB 067h + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[288+r10],xmm3 + movdqa xmm3,xmm4 +DB 067h + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[304+r10],xmm0 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[336+r10],xmm2 + + pand xmm0,XMMWORD PTR[64+rdi] + pand xmm1,XMMWORD PTR[80+rdi] + pand xmm2,XMMWORD PTR[96+rdi] + movdqa XMMWORD PTR[352+r10],xmm3 + pand xmm3,XMMWORD PTR[112+rdi] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-128))+rdi] + movdqa xmm5,XMMWORD PTR[((-112))+rdi] + movdqa xmm2,XMMWORD 
PTR[((-96))+rdi] + pand xmm4,XMMWORD PTR[112+r10] + movdqa xmm3,XMMWORD PTR[((-80))+rdi] + pand xmm5,XMMWORD PTR[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-64))+rdi] + movdqa xmm5,XMMWORD PTR[((-48))+rdi] + movdqa xmm2,XMMWORD PTR[((-32))+rdi] + pand xmm4,XMMWORD PTR[176+r10] + movdqa xmm3,XMMWORD PTR[((-16))+rdi] + pand xmm5,XMMWORD PTR[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[rdi] + movdqa xmm5,XMMWORD PTR[16+rdi] + movdqa xmm2,XMMWORD PTR[32+rdi] + pand xmm4,XMMWORD PTR[240+r10] + movdqa xmm3,XMMWORD PTR[48+rdi] + pand xmm5,XMMWORD PTR[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + pxor xmm0,xmm1 + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 + lea rdi,QWORD PTR[256+rdi] DB 102,72,15,126,194 - movq xmm0,QWORD PTR[96+rbx] - lea rdi,QWORD PTR[512+rdi] - pand xmm2,xmm5 -DB 067h,067h - pand xmm3,xmm6 - - - - - - - - lea rbx,QWORD PTR[((64+32+8))+r11*8+rsp] + lea rbx,QWORD PTR[((64+32+8))+rsp] mov r9,rdx mulx rax,r8,QWORD PTR[rsi] @@ -2030,37 +2382,31 @@ DB 067h,067h xor rbp,rbp mov rdx,r8 - por xmm1,xmm2 - pand xmm0,xmm7 - por xmm1,xmm3 mov QWORD PTR[((8+8))+rsp],rdi - por xmm0,xmm1 -DB 048h,08dh,0b6h,020h,000h,000h,000h + lea rsi,QWORD PTR[32+rsi] adcx r13,rax adcx r14,rbp mulx r10,rax,QWORD PTR[rcx] adcx r15,rax adox r10,r11 - mulx r11,rax,QWORD PTR[16+rcx] + mulx r11,rax,QWORD PTR[8+rcx] adcx r10,rax adox r11,r12 - mulx r12,rax,QWORD PTR[32+rcx] + mulx r12,rax,QWORD PTR[16+rcx] mov rdi,QWORD PTR[((24+8))+rsp] -DB 066h mov QWORD PTR[((-32))+rbx],r10 adcx r11,rax adox r12,r13 - mulx r15,rax,QWORD PTR[48+rcx] -DB 067h,067h + mulx r15,rax,QWORD PTR[24+rcx] mov rdx,r9 mov QWORD PTR[((-24))+rbx],r11 adcx r12,rax adox r15,rbp -DB 048h,08dh,089h,040h,000h,000h,000h + lea rcx,QWORD PTR[32+rcx] mov QWORD PTR[((-16))+rbx],r12 - + jmp $L$mulx4x_1st ALIGN 32 $L$mulx4x_1st:: @@ -2083,27 +2429,26 @@ DB 067h,067h mulx r15,rax,QWORD PTR[rcx] adcx r10,rax adox r11,r15 - mulx r15,rax,QWORD PTR[16+rcx] + mulx r15,rax,QWORD PTR[8+rcx] adcx r11,rax adox r12,r15 - mulx r15,rax,QWORD PTR[32+rcx] + mulx r15,rax,QWORD PTR[16+rcx] mov QWORD PTR[((-40))+rbx],r10 adcx r12,rax mov QWORD PTR[((-32))+rbx],r11 adox r13,r15 - mulx r15,rax,QWORD PTR[48+rcx] + mulx r15,rax,QWORD PTR[24+rcx] mov rdx,r9 mov QWORD PTR[((-24))+rbx],r12 adcx r13,rax adox r15,rbp - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] mov QWORD PTR[((-16))+rbx],r13 dec rdi jnz $L$mulx4x_1st mov rax,QWORD PTR[8+rsp] -DB 102,72,15,126,194 adc r15,rbp lea rsi,QWORD PTR[rax*1+rsi] add r14,r15 @@ -2114,6 +2459,64 @@ DB 102,72,15,126,194 ALIGN 32 $L$mulx4x_outer:: + lea r10,QWORD PTR[((16-256))+rbx] + pxor xmm4,xmm4 +DB 067h,067h + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD PTR[((-128))+rdi] + movdqa xmm1,XMMWORD PTR[((-112))+rdi] + movdqa xmm2,XMMWORD PTR[((-96))+rdi] + pand xmm0,XMMWORD PTR[256+r10] + movdqa xmm3,XMMWORD PTR[((-80))+rdi] + pand xmm1,XMMWORD PTR[272+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[288+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[304+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[((-64))+rdi] + movdqa xmm1,XMMWORD PTR[((-48))+rdi] + movdqa xmm2,XMMWORD PTR[((-32))+rdi] + pand xmm0,XMMWORD PTR[320+r10] + movdqa xmm3,XMMWORD PTR[((-16))+rdi] + pand xmm1,XMMWORD PTR[336+r10] + por 
xmm4,xmm0 + pand xmm2,XMMWORD PTR[352+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[368+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[rdi] + movdqa xmm1,XMMWORD PTR[16+rdi] + movdqa xmm2,XMMWORD PTR[32+rdi] + pand xmm0,XMMWORD PTR[384+r10] + movdqa xmm3,XMMWORD PTR[48+rdi] + pand xmm1,XMMWORD PTR[400+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[416+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[432+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[64+rdi] + movdqa xmm1,XMMWORD PTR[80+rdi] + movdqa xmm2,XMMWORD PTR[96+rdi] + pand xmm0,XMMWORD PTR[448+r10] + movdqa xmm3,XMMWORD PTR[112+rdi] + pand xmm1,XMMWORD PTR[464+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[480+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[496+r10] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,04eh + por xmm0,xmm4 + lea rdi,QWORD PTR[256+rdi] +DB 102,72,15,126,194 + mov QWORD PTR[rbx],rbp lea rbx,QWORD PTR[32+rax*1+rbx] mulx r11,r8,QWORD PTR[rsi] @@ -2128,54 +2531,37 @@ $L$mulx4x_outer:: mulx r14,rdx,QWORD PTR[24+rsi] adox r12,QWORD PTR[((-16))+rbx] adcx r13,rdx - lea rcx,QWORD PTR[rax*2+rcx] + lea rcx,QWORD PTR[rax*1+rcx] lea rsi,QWORD PTR[32+rsi] adox r13,QWORD PTR[((-8))+rbx] adcx r14,rbp adox r14,rbp -DB 067h mov r15,r8 imul r8,QWORD PTR[((32+8))+rsp] - movq xmm0,QWORD PTR[((-96))+rdi] -DB 067h,067h mov rdx,r8 - movq xmm1,QWORD PTR[((-32))+rdi] -DB 067h - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+rdi] -DB 067h - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+rdi] - add rdi,256 -DB 067h - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 xor rbp,rbp mov QWORD PTR[((8+8))+rsp],rdi mulx r10,rax,QWORD PTR[rcx] adcx r15,rax adox r10,r11 - mulx r11,rax,QWORD PTR[16+rcx] + mulx r11,rax,QWORD PTR[8+rcx] adcx r10,rax adox r11,r12 - mulx r12,rax,QWORD PTR[32+rcx] + mulx r12,rax,QWORD PTR[16+rcx] adcx r11,rax adox r12,r13 - mulx r15,rax,QWORD PTR[48+rcx] + mulx r15,rax,QWORD PTR[24+rcx] mov rdx,r9 - por xmm0,xmm2 mov rdi,QWORD PTR[((24+8))+rsp] mov QWORD PTR[((-32))+rbx],r10 - por xmm0,xmm3 adcx r12,rax mov QWORD PTR[((-24))+rbx],r11 adox r15,rbp mov QWORD PTR[((-16))+rbx],r12 - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] jmp $L$mulx4x_inner ALIGN 32 @@ -2203,17 +2589,17 @@ $L$mulx4x_inner:: mulx r15,rax,QWORD PTR[rcx] adcx r10,rax adox r11,r15 - mulx r15,rax,QWORD PTR[16+rcx] + mulx r15,rax,QWORD PTR[8+rcx] adcx r11,rax adox r12,r15 - mulx r15,rax,QWORD PTR[32+rcx] + mulx r15,rax,QWORD PTR[16+rcx] mov QWORD PTR[((-40))+rbx],r10 adcx r12,rax adox r13,r15 mov QWORD PTR[((-32))+rbx],r11 - mulx r15,rax,QWORD PTR[48+rcx] + mulx r15,rax,QWORD PTR[24+rcx] mov rdx,r9 - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] mov QWORD PTR[((-24))+rbx],r12 adcx r13,rax adox r15,rbp @@ -2223,7 +2609,6 @@ $L$mulx4x_inner:: jnz $L$mulx4x_inner mov rax,QWORD PTR[((0+8))+rsp] -DB 102,72,15,126,194 adc r15,rbp sub rdi,QWORD PTR[rbx] mov rdi,QWORD PTR[((8+8))+rsp] @@ -2236,20 +2621,26 @@ DB 102,72,15,126,194 cmp rdi,r10 jb $L$mulx4x_outer - mov r10,QWORD PTR[((-16))+rcx] + mov r10,QWORD PTR[((-8))+rcx] + mov r8,rbp + mov r12,QWORD PTR[rax*1+rcx] + lea rbp,QWORD PTR[rax*1+rcx] + mov rcx,rax + lea rdi,QWORD PTR[rax*1+rbx] + xor eax,eax xor r15,r15 sub r10,r14 adc r15,r15 - or rbp,r15 - xor rbp,1 - lea rdi,QWORD PTR[rax*1+rbx] - lea rcx,QWORD PTR[rax*2+rcx] -DB 067h,067h - sar rax,3+2 - lea rbp,QWORD PTR[rbp*8+rcx] + or r8,r15 + sar rcx,3+2 + sub rax,r8 mov rdx,QWORD PTR[((56+8))+rsp] - mov rcx,rax - jmp $L$sqrx4x_sub + dec r12 + mov r13,QWORD PTR[8+rbp] + xor r8,r8 + mov r14,QWORD PTR[16+rbp] + mov 
r15,QWORD PTR[24+rbp] + jmp $L$sqrx4x_sub_entry mulx4x_internal ENDP ALIGN 32 @@ -2267,7 +2658,6 @@ $L$SEH_begin_bn_powerx5:: $L$powerx5_enter:: -DB 067h mov rax,rsp push rbx push rbp @@ -2275,13 +2665,9 @@ DB 067h push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 -DB 067h - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10,QWORD PTR[r9*2+r9] neg r9 mov r8,QWORD PTR[r8] @@ -2291,19 +2677,20 @@ DB 067h - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$pwrx_sp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$pwrx_sp_done ALIGN 32 $L$pwrx_sp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -2334,10 +2721,15 @@ DB 102,72,15,110,226 $L$powerx5_body:: call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal mov r9,r10 mov rdi,rsi @@ -2349,8 +2741,7 @@ DB 102,72,15,126,226 mov rsi,QWORD PTR[40+rsp] mov rax,1 - movaps xmm6,XMMWORD PTR[((-88))+rsi] - movaps xmm7,XMMWORD PTR[((-72))+rsi] + mov r15,QWORD PTR[((-48))+rsi] mov r14,QWORD PTR[((-40))+rsi] mov r13,QWORD PTR[((-32))+rsi] @@ -2766,11 +3157,11 @@ $L$sqrx4x_shift_n_add_break:: mov QWORD PTR[56+rdi],rbx lea rdi,QWORD PTR[64+rdi] DB 102,72,15,126,213 -sqrx8x_reduction:: +__bn_sqrx8x_reduction:: xor eax,eax mov rbx,QWORD PTR[((32+8))+rsp] mov rdx,QWORD PTR[((48+8))+rsp] - lea rcx,QWORD PTR[((-128))+r9*2+rbp] + lea rcx,QWORD PTR[((-64))+r9*1+rbp] mov QWORD PTR[((0+8))+rsp],rcx mov QWORD PTR[((8+8))+rsp],rdi @@ -2803,19 +3194,19 @@ $L$sqrx8x_reduce:: adcx rax,rbx adox r8,r9 - mulx r9,rbx,QWORD PTR[16+rbp] + mulx r9,rbx,QWORD PTR[8+rbp] adcx r8,rbx adox r9,r10 - mulx r10,rbx,QWORD PTR[32+rbp] + mulx r10,rbx,QWORD PTR[16+rbp] adcx r9,rbx adox r10,r11 - mulx r11,rbx,QWORD PTR[48+rbp] + mulx r11,rbx,QWORD PTR[24+rbp] adcx r10,rbx adox r11,r12 -DB 0c4h,062h,0e3h,0f6h,0a5h,040h,000h,000h,000h +DB 0c4h,062h,0e3h,0f6h,0a5h,020h,000h,000h,000h mov rax,rdx mov rdx,r8 adcx r11,rbx @@ -2825,15 +3216,15 @@ DB 0c4h,062h,0e3h,0f6h,0a5h,040h,000h,000h,000h mov rdx,rax mov QWORD PTR[((64+48+8))+rcx*8+rsp],rax - mulx r13,rax,QWORD PTR[80+rbp] + mulx r13,rax,QWORD PTR[40+rbp] adcx r12,rax adox r13,r14 - mulx r14,rax,QWORD PTR[96+rbp] + mulx r14,rax,QWORD PTR[48+rbp] adcx r13,rax adox r14,r15 - mulx r15,rax,QWORD PTR[112+rbp] + mulx r15,rax,QWORD PTR[56+rbp] mov rdx,rbx adcx r14,rax adox r15,rsi @@ -2849,7 +3240,7 @@ DB 067h,067h,067h mov rdx,QWORD PTR[((48+8))+rsp] add r8,QWORD PTR[rdi] - lea rbp,QWORD PTR[128+rbp] + lea rbp,QWORD PTR[64+rbp] mov rcx,-8 adcx r9,QWORD PTR[8+rdi] adcx r10,QWORD PTR[16+rdi] @@ -2872,31 +3263,31 @@ $L$sqrx8x_tail:: adcx rbx,rax adox r8,r9 - mulx r9,rax,QWORD PTR[16+rbp] + mulx r9,rax,QWORD PTR[8+rbp] adcx r8,rax adox r9,r10 - mulx r10,rax,QWORD PTR[32+rbp] + mulx r10,rax,QWORD PTR[16+rbp] adcx r9,rax adox r10,r11 - mulx r11,rax,QWORD PTR[48+rbp] + mulx r11,rax,QWORD PTR[24+rbp] adcx r10,rax adox r11,r12 -DB 0c4h,062h,0fbh,0f6h,0a5h,040h,000h,000h,000h +DB 0c4h,062h,0fbh,0f6h,0a5h,020h,000h,000h,000h adcx r11,rax adox r12,r13 - mulx r13,rax,QWORD PTR[80+rbp] + mulx 
r13,rax,QWORD PTR[40+rbp] adcx r12,rax adox r13,r14 - mulx r14,rax,QWORD PTR[96+rbp] + mulx r14,rax,QWORD PTR[48+rbp] adcx r13,rax adox r14,r15 - mulx r15,rax,QWORD PTR[112+rbp] + mulx r15,rax,QWORD PTR[56+rbp] mov rdx,QWORD PTR[((72+48+8))+rcx*8+rsp] adcx r14,rax adox r15,rsi @@ -2912,7 +3303,7 @@ DB 0c4h,062h,0fbh,0f6h,0a5h,040h,000h,000h,000h sub rsi,QWORD PTR[((16+8))+rsp] mov rdx,QWORD PTR[((48+8))+rsp] - lea rbp,QWORD PTR[128+rbp] + lea rbp,QWORD PTR[64+rbp] adc r8,QWORD PTR[rdi] adc r9,QWORD PTR[8+rdi] adc r10,QWORD PTR[16+rdi] @@ -2948,7 +3339,7 @@ $L$sqrx8x_no_tail:: adc r8,QWORD PTR[rdi] DB 102,72,15,126,217 adc r9,QWORD PTR[8+rdi] - mov rsi,QWORD PTR[112+rbp] + mov rsi,QWORD PTR[56+rbp] DB 102,72,15,126,213 adc r10,QWORD PTR[16+rdi] adc r11,QWORD PTR[24+rdi] @@ -2974,45 +3365,58 @@ DB 102,72,15,126,213 lea rdi,QWORD PTR[64+rcx*1+rdi] cmp r8,QWORD PTR[((8+8))+rsp] jb $L$sqrx8x_reduction_loop - xor ebx,ebx - sub rsi,r15 - adc rbx,rbx + DB 0F3h,0C3h ;repret +bn_sqrx8x_internal ENDP +ALIGN 32 +__bn_postx4x_internal:: + mov r12,QWORD PTR[rbp] mov r10,rcx - or rax,rbx mov r9,rcx - xor rax,1 + neg rax sar rcx,3+2 - lea rbp,QWORD PTR[rax*8+rbp] DB 102,72,15,126,202 DB 102,72,15,126,206 - jmp $L$sqrx4x_sub + dec r12 + mov r13,QWORD PTR[8+rbp] + xor r8,r8 + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] + jmp $L$sqrx4x_sub_entry -ALIGN 32 +ALIGN 16 $L$sqrx4x_sub:: -DB 066h - mov r12,QWORD PTR[rdi] - mov r13,QWORD PTR[8+rdi] - sbb r12,QWORD PTR[rbp] - mov r14,QWORD PTR[16+rdi] - sbb r13,QWORD PTR[16+rbp] - mov r15,QWORD PTR[24+rdi] - lea rdi,QWORD PTR[32+rdi] - sbb r14,QWORD PTR[32+rbp] + mov r12,QWORD PTR[rbp] + mov r13,QWORD PTR[8+rbp] + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] +$L$sqrx4x_sub_entry:: + andn r12,r12,rax + lea rbp,QWORD PTR[32+rbp] + andn r13,r13,rax + andn r14,r14,rax + andn r15,r15,rax + + neg r8 + adc r12,QWORD PTR[rdi] + adc r13,QWORD PTR[8+rdi] + adc r14,QWORD PTR[16+rdi] + adc r15,QWORD PTR[24+rdi] mov QWORD PTR[rdx],r12 - sbb r15,QWORD PTR[48+rbp] - lea rbp,QWORD PTR[64+rbp] + lea rdi,QWORD PTR[32+rdi] mov QWORD PTR[8+rdx],r13 + sbb r8,r8 mov QWORD PTR[16+rdx],r14 mov QWORD PTR[24+rdx],r15 lea rdx,QWORD PTR[32+rdx] inc rcx jnz $L$sqrx4x_sub + neg r9 DB 0F3h,0C3h ;repret -bn_sqrx8x_internal ENDP + PUBLIC bn_get_bits5 ALIGN 16 @@ -3052,55 +3456,171 @@ bn_scatter5 ENDP PUBLIC bn_gather5 -ALIGN 16 +ALIGN 32 bn_gather5 PROC PUBLIC $L$SEH_begin_bn_gather5:: -DB 048h,083h,0ech,028h -DB 00fh,029h,034h,024h -DB 00fh,029h,07ch,024h,010h - mov r11d,r9d - shr r9d,3 - and r11,7 - not r9d - lea rax,QWORD PTR[$L$magic_masks] - and r9d,3 - lea r8,QWORD PTR[128+r11*8+r8] - movq xmm4,QWORD PTR[r9*8+rax] - movq xmm5,QWORD PTR[8+r9*8+rax] - movq xmm6,QWORD PTR[16+r9*8+rax] - movq xmm7,QWORD PTR[24+r9*8+rax] +DB 04ch,08dh,014h,024h +DB 048h,081h,0ech,008h,001h,000h,000h + lea rax,QWORD PTR[$L$inc] + and rsp,-16 + + movd xmm5,r9d + movdqa xmm0,XMMWORD PTR[rax] + movdqa xmm1,XMMWORD PTR[16+rax] + lea r11,QWORD PTR[128+r8] + lea rax,QWORD PTR[128+rsp] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[(-128)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[(-112)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[(-96)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[(-80)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + 
pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[(-64)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[(-48)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[(-32)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[(-16)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[16+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[32+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[48+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[64+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[80+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[96+rax],xmm2 + movdqa xmm2,xmm4 + movdqa XMMWORD PTR[112+rax],xmm3 jmp $L$gather -ALIGN 16 -$L$gather:: - movq xmm0,QWORD PTR[((-128))+r8] - movq xmm1,QWORD PTR[((-64))+r8] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[r8] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[64+r8] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 -DB 067h,067h - por xmm0,xmm2 - lea r8,QWORD PTR[256+r8] - por xmm0,xmm3 +ALIGN 32 +$L$gather:: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD PTR[((-128))+r11] + movdqa xmm1,XMMWORD PTR[((-112))+r11] + movdqa xmm2,XMMWORD PTR[((-96))+r11] + pand xmm0,XMMWORD PTR[((-128))+rax] + movdqa xmm3,XMMWORD PTR[((-80))+r11] + pand xmm1,XMMWORD PTR[((-112))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-96))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-80))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[((-64))+r11] + movdqa xmm1,XMMWORD PTR[((-48))+r11] + movdqa xmm2,XMMWORD PTR[((-32))+r11] + pand xmm0,XMMWORD PTR[((-64))+rax] + movdqa xmm3,XMMWORD PTR[((-16))+r11] + pand xmm1,XMMWORD PTR[((-48))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-32))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-16))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[r11] + movdqa xmm1,XMMWORD PTR[16+r11] + movdqa xmm2,XMMWORD PTR[32+r11] + pand xmm0,XMMWORD PTR[rax] + movdqa xmm3,XMMWORD PTR[48+r11] + pand xmm1,XMMWORD PTR[16+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[32+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[48+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[64+r11] + movdqa xmm1,XMMWORD PTR[80+r11] + movdqa xmm2,XMMWORD PTR[96+r11] + pand xmm0,XMMWORD PTR[64+rax] + movdqa xmm3,XMMWORD PTR[112+r11] + pand xmm1,XMMWORD PTR[80+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[96+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[112+rax] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + lea r11,QWORD PTR[256+r11] + pshufd xmm0,xmm4,04eh + por xmm0,xmm4 movq QWORD PTR[rcx],xmm0 lea rcx,QWORD PTR[8+rcx] sub edx,1 jnz $L$gather - movaps xmm6,XMMWORD PTR[rsp] - movaps xmm7,XMMWORD PTR[16+rsp] - lea rsp,QWORD PTR[40+rsp] + + lea rsp,QWORD PTR[r10] DB 0F3h,0C3h ;repret $L$SEH_end_bn_gather5:: bn_gather5 ENDP ALIGN 64 -$L$magic_masks:: - DD 0,0,0,0,0,0,-1,-1 - DD 0,0,0,0,0,0,0,0 +$L$inc:: + DD 0,0,1,1 + DD 2,2,2,2 DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 @@ -3142,19 +3662,16 @@ mul_handler PROC PRIVATE lea r10,QWORD PTR[$L$mul_epilogue] cmp rbx,r10 - jb 
$L$body_40 + ja $L$body_40 mov r10,QWORD PTR[192+r8] mov rax,QWORD PTR[8+r10*8+rax] + jmp $L$body_proceed $L$body_40:: mov rax,QWORD PTR[40+rax] $L$body_proceed:: - - movaps xmm0,XMMWORD PTR[((-88))+rax] - movaps xmm1,XMMWORD PTR[((-72))+rax] - mov rbx,QWORD PTR[((-8))+rax] mov rbp,QWORD PTR[((-16))+rax] mov r12,QWORD PTR[((-24))+rax] @@ -3167,8 +3684,6 @@ $L$body_proceed:: mov QWORD PTR[224+r8],r13 mov QWORD PTR[232+r8],r14 mov QWORD PTR[240+r8],r15 - movups XMMWORD PTR[512+r8],xmm0 - movups XMMWORD PTR[528+r8],xmm1 $L$common_seh_tail:: mov rdi,QWORD PTR[8+rax] @@ -3273,10 +3788,9 @@ DB 9,0,0,0 DD imagerel $L$powerx5_body,imagerel $L$powerx5_epilogue ALIGN 8 $L$SEH_info_bn_gather5:: -DB 001h,00dh,005h,000h -DB 00dh,078h,001h,000h -DB 008h,068h,000h,000h -DB 004h,042h,000h,000h +DB 001h,00bh,003h,00ah +DB 00bh,001h,021h,000h +DB 004h,0a3h,000h,000h ALIGN 8 .xdata ENDS diff --git a/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm b/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm index 3fa69816b50e7b..f38d253c166748 100644 --- a/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm +++ b/deps/openssl/asm/x64-win32-masm/ec/ecp_nistz256-x86_64.asm @@ -1813,6 +1813,7 @@ $L$SEH_begin_ecp_nistz256_point_double:: push r15 sub rsp,32*5+8 +$L$point_double_shortcutq:: movdqu xmm0,XMMWORD PTR[rsi] mov rbx,rsi movdqu xmm1,XMMWORD PTR[16+rsi] @@ -2091,6 +2092,7 @@ DB 102,72,15,110,199 mov r14,QWORD PTR[((64+8))+rbx] mov r15,QWORD PTR[((64+16))+rbx] mov r8,QWORD PTR[((64+24))+rbx] +DB 102,72,15,110,203 lea rsi,QWORD PTR[((64-0))+rbx] lea rdi,QWORD PTR[32+rsp] @@ -2182,7 +2184,7 @@ DB 102,73,15,126,217 test r8,r8 jnz $L$add_proceedq test r9,r9 - jz $L$add_proceedq + jz $L$add_doubleq DB 102,72,15,126,199 pxor xmm0,xmm0 @@ -2194,6 +2196,13 @@ DB 102,72,15,126,199 movdqu XMMWORD PTR[80+rdi],xmm0 jmp $L$add_doneq +ALIGN 32 +$L$add_doubleq:: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + jmp $L$point_double_shortcutq + ALIGN 32 $L$add_proceedq:: mov rax,QWORD PTR[((0+64))+rsp] @@ -2876,6 +2885,7 @@ $L$point_doublex:: push r15 sub rsp,32*5+8 +$L$point_double_shortcutx:: movdqu xmm0,XMMWORD PTR[rsi] mov rbx,rsi movdqu xmm1,XMMWORD PTR[16+rsi] @@ -3150,6 +3160,7 @@ DB 102,72,15,110,199 mov r14,QWORD PTR[((64+8))+rbx] mov r15,QWORD PTR[((64+16))+rbx] mov r8,QWORD PTR[((64+24))+rbx] +DB 102,72,15,110,203 lea rsi,QWORD PTR[((64-128))+rbx] lea rdi,QWORD PTR[32+rsp] @@ -3241,7 +3252,7 @@ DB 102,73,15,126,217 test r8,r8 jnz $L$add_proceedx test r9,r9 - jz $L$add_proceedx + jz $L$add_doublex DB 102,72,15,126,199 pxor xmm0,xmm0 @@ -3253,6 +3264,13 @@ DB 102,72,15,126,199 movdqu XMMWORD PTR[80+rdi],xmm0 jmp $L$add_donex +ALIGN 32 +$L$add_doublex:: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + jmp $L$point_double_shortcutx + ALIGN 32 $L$add_proceedx:: mov rdx,QWORD PTR[((0+64))+rsp] diff --git a/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm b/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm index 0626d8f7824646..6552f7d017f6a4 100644 --- a/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm +++ b/deps/openssl/asm/x64-win32-masm/modes/aesni-gcm-x86_64.asm @@ -412,7 +412,7 @@ $L$dec_no_key_aliasing:: vzeroupper movaps xmm6,XMMWORD PTR[((-216))+rax] - movaps xmm7,XMMWORD PTR[((-216))+rax] + movaps xmm7,XMMWORD PTR[((-200))+rax] movaps xmm8,XMMWORD PTR[((-184))+rax] movaps xmm9,XMMWORD PTR[((-168))+rax] movaps xmm10,XMMWORD PTR[((-152))+rax] diff --git a/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s b/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s 
index 8a9ef772b73414..816b1d55bb206b 100644 --- a/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s +++ b/deps/openssl/asm/x86-elf-gas/sha/sha1-586.s @@ -23,11 +23,6 @@ sha1_block_data_order: jz .L001x86 testl $536870912,%ecx jnz .Lshaext_shortcut - andl $268435456,%edx - andl $1073741824,%eax - orl %edx,%eax - cmpl $1342177280,%eax - je .Lavx_shortcut jmp .Lssse3_shortcut .align 16 .L001x86: @@ -2785,1176 +2780,6 @@ _sha1_block_data_order_ssse3: popl %ebp ret .size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 -.type _sha1_block_data_order_avx,@function -.align 16 -_sha1_block_data_order_avx: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call .L008pic_point -.L008pic_point: - popl %ebp - leal .LK_XX_XX-.L008pic_point(%ebp),%ebp -.Lavx_shortcut: - vzeroall - vmovdqa (%ebp),%xmm7 - vmovdqa 16(%ebp),%xmm0 - vmovdqa 32(%ebp),%xmm1 - vmovdqa 48(%ebp),%xmm2 - vmovdqa 64(%ebp),%xmm6 - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%edx - movl %esp,%esi - subl $208,%esp - andl $-64,%esp - vmovdqa %xmm0,112(%esp) - vmovdqa %xmm1,128(%esp) - vmovdqa %xmm2,144(%esp) - shll $6,%edx - vmovdqa %xmm7,160(%esp) - addl %ebp,%edx - vmovdqa %xmm6,176(%esp) - addl $64,%ebp - movl %edi,192(%esp) - movl %ebp,196(%esp) - movl %edx,200(%esp) - movl %esi,204(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - movl 16(%edi),%edi - movl %ebx,%esi - vmovdqu -64(%ebp),%xmm0 - vmovdqu -48(%ebp),%xmm1 - vmovdqu -32(%ebp),%xmm2 - vmovdqu -16(%ebp),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vmovdqa %xmm7,96(%esp) - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm7,%xmm0,%xmm4 - vpaddd %xmm7,%xmm1,%xmm5 - vpaddd %xmm7,%xmm2,%xmm6 - vmovdqa %xmm4,(%esp) - movl %ecx,%ebp - vmovdqa %xmm5,16(%esp) - xorl %edx,%ebp - vmovdqa %xmm6,32(%esp) - andl %ebp,%esi - jmp .L009loop -.align 16 -.L009loop: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%ebp - addl (%esp),%edi - vpaddd %xmm3,%xmm7,%xmm7 - vmovdqa %xmm0,64(%esp) - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%edi - vpxor %xmm2,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vmovdqa %xmm7,48(%esp) - movl %edi,%esi - addl 4(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%edi,%edi - addl %ebp,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm6 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm0 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%ebp - addl 8(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrld $30,%xmm0,%xmm7 - vpor %xmm6,%xmm4,%xmm4 - addl %esi,%ecx - andl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - vpslld $2,%xmm0,%xmm0 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vpxor %xmm7,%xmm4,%xmm4 - movl %ecx,%esi - addl 12(%esp),%ebx - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpxor %xmm0,%xmm4,%xmm4 - addl %ebp,%ebx - andl %edx,%esi - vmovdqa 96(%esp),%xmm0 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%ebp - addl 16(%esp),%eax - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqa %xmm1,80(%esp) - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vmovdqa %xmm0,(%esp) - movl %eax,%esi - addl 20(%esp),%edi - vpxor %xmm7,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl 
$5,%eax,%eax - addl %ebp,%edi - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm7 - xorl %ecx,%ebx - addl %eax,%edi - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm1 - vpaddd %xmm5,%xmm5,%xmm5 - movl %edi,%ebp - addl 24(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm0 - vpor %xmm7,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpxor %xmm0,%xmm5,%xmm5 - movl %edx,%esi - addl 28(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpxor %xmm1,%xmm5,%xmm5 - addl %ebp,%ecx - andl %edi,%esi - vmovdqa 112(%esp),%xmm1 - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%ebp - addl 32(%esp),%ebx - vpaddd %xmm5,%xmm1,%xmm1 - vmovdqa %xmm2,96(%esp) - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm0 - addl %esi,%ebx - andl %edx,%ebp - vpxor %xmm2,%xmm6,%xmm6 - xorl %edi,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%ecx,%ecx - xorl %edi,%ebp - vmovdqa %xmm1,16(%esp) - movl %ebx,%esi - addl 36(%esp),%eax - vpxor %xmm0,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - addl %ebp,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm0 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm2 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%ebp - addl 40(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm1 - vpor %xmm0,%xmm6,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - vmovdqa 64(%esp),%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vpxor %xmm1,%xmm6,%xmm6 - movl %edi,%esi - addl 44(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpxor %xmm2,%xmm6,%xmm6 - addl %ebp,%edx - andl %eax,%esi - vmovdqa 112(%esp),%xmm2 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%ebp - addl 48(%esp),%ecx - vpaddd %xmm6,%xmm2,%xmm2 - vmovdqa %xmm3,64(%esp) - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm1 - addl %esi,%ecx - andl %edi,%ebp - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%edi - addl %edx,%ecx - vpxor %xmm5,%xmm1,%xmm1 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vmovdqa %xmm2,32(%esp) - movl %ecx,%esi - addl 52(%esp),%ebx - vpxor %xmm1,%xmm7,%xmm7 - xorl %edi,%edx - shldl $5,%ecx,%ecx - addl %ebp,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm1 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpslldq $12,%xmm7,%xmm3 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%ebp - addl 56(%esp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm2 - vpor %xmm1,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - vmovdqa 80(%esp),%xmm1 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vpxor %xmm2,%xmm7,%xmm7 - movl %eax,%esi - addl 60(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpxor %xmm3,%xmm7,%xmm7 - addl %ebp,%edi - andl %ebx,%esi - vmovdqa 112(%esp),%xmm3 - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %edi,%ebp - addl (%esp),%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,80(%esp) - xorl %ebx,%eax - shldl $5,%edi,%edi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - addl %esi,%edx - andl %eax,%ebp - vpxor %xmm2,%xmm0,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - movl %edx,%esi - addl 4(%esp),%ecx 
- xorl %eax,%edi - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %ebp,%ecx - andl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%ebp - addl 8(%esp),%ebx - vpor %xmm2,%xmm0,%xmm0 - xorl %edi,%edx - shldl $5,%ecx,%ecx - vmovdqa 96(%esp),%xmm2 - addl %esi,%ebx - andl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 12(%esp),%eax - xorl %edi,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,96(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm3,%xmm1,%xmm1 - addl 20(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm3,%xmm1,%xmm1 - addl 28(%esp),%ebx - xorl %edi,%ebp - vmovdqa 64(%esp),%xmm3 - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,64(%esp) - addl %esi,%eax - xorl %edx,%ebp - vmovdqa 128(%esp),%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm4,%xmm2,%xmm2 - addl 36(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - addl 40(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vpor %xmm4,%xmm2,%xmm2 - addl 44(%esp),%ecx - xorl %eax,%ebp - vmovdqa 80(%esp),%xmm4 - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,80(%esp) - addl %esi,%ebx - xorl %edi,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%edx - xorl %ebx,%ebp - vmovdqa 96(%esp),%xmm5 - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpalignr $8,%xmm2,%xmm3,%xmm6 - vpxor %xmm0,%xmm4,%xmm4 - addl (%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - vmovdqa %xmm0,96(%esp) - addl %esi,%ecx - xorl %eax,%ebp - vmovdqa %xmm7,%xmm0 - vpaddd %xmm3,%xmm7,%xmm7 - shrdl $7,%edi,%edi - addl %edx,%ecx - vpxor %xmm6,%xmm4,%xmm4 - addl 
4(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm6 - vmovdqa %xmm7,48(%esp) - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm6,%xmm4,%xmm4 - addl 12(%esp),%edi - xorl %ecx,%ebp - vmovdqa 64(%esp),%xmm6 - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpalignr $8,%xmm3,%xmm4,%xmm7 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - vpxor %xmm6,%xmm5,%xmm5 - vmovdqa %xmm1,64(%esp) - addl %esi,%edx - xorl %ebx,%ebp - vmovdqa %xmm0,%xmm1 - vpaddd %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - addl %edi,%edx - vpxor %xmm7,%xmm5,%xmm5 - addl 20(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm7 - vmovdqa %xmm0,(%esp) - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm7,%xmm5,%xmm5 - addl 28(%esp),%eax - vmovdqa 80(%esp),%xmm7 - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%esp),%edi - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - vmovdqa %xmm2,80(%esp) - movl %eax,%ebp - xorl %ecx,%esi - vmovdqa %xmm1,%xmm2 - vpaddd %xmm5,%xmm1,%xmm1 - shldl $5,%eax,%eax - addl %esi,%edi - vpxor %xmm0,%xmm6,%xmm6 - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 36(%esp),%edx - vpsrld $30,%xmm6,%xmm0 - vmovdqa %xmm1,16(%esp) - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - addl 40(%esp),%ecx - andl %eax,%esi - vpor %xmm0,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%edi,%edi - vmovdqa 96(%esp),%xmm0 - movl %edx,%ebp - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 44(%esp),%ebx - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm1 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%esp),%eax - andl %edx,%esi - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa %xmm3,96(%esp) - movl %ebx,%ebp - xorl %edx,%esi - vmovdqa 144(%esp),%xmm3 - vpaddd %xmm6,%xmm2,%xmm2 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%esp),%edi - vpsrld $30,%xmm7,%xmm1 - vmovdqa %xmm2,32(%esp) - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - addl 56(%esp),%edx - andl %ebx,%esi - vpor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vmovdqa 64(%esp),%xmm1 - movl %edi,%ebp - xorl %ebx,%esi - shldl $5,%edi,%edi - addl %esi,%edx - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 60(%esp),%ecx - andl %eax,%ebp - xorl %ebx,%eax - shrdl 
$7,%edi,%edi - movl %edx,%esi - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - addl (%esp),%ebx - andl %edi,%esi - xorl %eax,%edi - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,64(%esp) - movl %ecx,%ebp - xorl %edi,%esi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm2,%xmm0,%xmm0 - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 4(%esp),%eax - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%esp),%edi - andl %ecx,%esi - vpor %xmm2,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vmovdqa 80(%esp),%xmm2 - movl %eax,%ebp - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 12(%esp),%edx - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%edi,%edi - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,80(%esp) - movl %edx,%ebp - xorl %eax,%esi - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm3,%xmm1,%xmm1 - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 20(%esp),%ebx - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - addl 24(%esp),%eax - andl %edx,%esi - vpor %xmm3,%xmm1,%xmm1 - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vmovdqa 96(%esp),%xmm3 - movl %ebx,%ebp - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%esp),%edi - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,96(%esp) - movl %edi,%ebp - xorl %ebx,%esi - vmovdqa %xmm5,%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shldl $5,%edi,%edi - addl %esi,%edx - vpxor %xmm4,%xmm2,%xmm2 - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 36(%esp),%ecx - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - addl 40(%esp),%ebx - andl %edi,%esi - vpor %xmm4,%xmm2,%xmm2 - xorl %eax,%edi - shrdl $7,%edx,%edx - vmovdqa 64(%esp),%xmm4 - movl %ecx,%ebp - xorl %edi,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 44(%esp),%eax - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl 
$5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,64(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl (%esp),%eax - vpaddd %xmm3,%xmm7,%xmm7 - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm7,48(%esp) - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 8(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 12(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - movl 196(%esp),%ebp - cmpl 200(%esp),%ebp - je .L010done - vmovdqa 160(%esp),%xmm7 - vmovdqa 176(%esp),%xmm6 - vmovdqu (%ebp),%xmm0 - vmovdqu 16(%ebp),%xmm1 - vmovdqu 32(%ebp),%xmm2 - vmovdqu 48(%ebp),%xmm3 - addl $64,%ebp - vpshufb %xmm6,%xmm0,%xmm0 - movl %ebp,196(%esp) - vmovdqa %xmm7,96(%esp) - addl 16(%esp),%ebx - xorl %edi,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpaddd %xmm7,%xmm0,%xmm4 - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,(%esp) - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%ebp - shldl $5,%edx,%edx - vpaddd %xmm7,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vmovdqa %xmm5,16(%esp) - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %edi,%ebp - shldl $5,%edi,%edi - vpaddd %xmm7,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vmovdqa %xmm6,32(%esp) - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl 
%ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,%ebx - movl %ecx,8(%ebp) - xorl %edx,%ebx - movl %edx,12(%ebp) - movl %edi,16(%ebp) - movl %esi,%ebp - andl %ebx,%esi - movl %ebp,%ebx - jmp .L009loop -.align 16 -.L010done: - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroall - movl 192(%esp),%ebp - addl (%ebp),%eax - movl 204(%esp),%esp - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - movl %edi,16(%ebp) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx .align 64 .LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 diff --git a/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s b/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s index b434e42babe16e..836d91886bda53 100644 --- a/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s +++ b/deps/openssl/asm/x86-elf-gas/sha/sha256-586.s @@ -40,13 +40,12 @@ sha256_block_data_order: orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx - je .L005AVX testl $512,%ebx - jnz .L006SSSE3 + jnz .L005SSSE3 .L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae .L007unrolled + jae .L006unrolled jmp .L002loop .align 16 .L002loop: @@ -118,7 +117,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00800_15: +.L00700_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -156,11 +155,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00800_15 + jne .L00700_15 movl 156(%esp),%ecx - 
jmp .L00916_63 + jmp .L00816_63 .align 16 -.L00916_63: +.L00816_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -215,7 +214,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00916_63 + jne .L00816_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -259,7 +258,7 @@ sha256_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L007unrolled: +.L006unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -276,9 +275,9 @@ sha256_block_data_order: movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp .L010grand_loop + jmp .L009grand_loop .align 16 -.L010grand_loop: +.L009grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx @@ -3158,7 +3157,7 @@ sha256_block_data_order: movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb .L010grand_loop + jb .L009grand_loop movl 108(%esp),%esp popl %edi popl %esi @@ -3177,9 +3176,9 @@ sha256_block_data_order: pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp .L011loop_shaext + jmp .L010loop_shaext .align 16 -.L011loop_shaext: +.L010loop_shaext: movdqu (%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 @@ -3349,7 +3348,7 @@ sha256_block_data_order: .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz .L011loop_shaext + jnz .L010loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 @@ -3364,7 +3363,7 @@ sha256_block_data_order: popl %ebp ret .align 32 -.L006SSSE3: +.L005SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx @@ -3383,9 +3382,9 @@ sha256_block_data_order: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L012grand_ssse3 + jmp .L011grand_ssse3 .align 16 -.L012grand_ssse3: +.L011grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3408,9 +3407,9 @@ sha256_block_data_order: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L013ssse3_00_47 + jmp .L012ssse3_00_47 .align 16 -.L013ssse3_00_47: +.L012ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4053,7 +4052,7 @@ sha256_block_data_order: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L013ssse3_00_47 + jne .L012ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4567,2217 +4566,12 @@ sha256_block_data_order: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L012grand_ssse3 + jb .L011grand_ssse3 movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret -.align 32 -.L005AVX: - andl $264,%edx - cmpl $264,%edx - je .L014AVX_BMI - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp .L015grand_avx -.align 32 -.L015grand_avx: - vmovdqu (%edi),%xmm0 - vmovdqu 16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp .L016avx_00_47 -.align 16 
-.L016avx_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm0,%xmm0 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm0,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd (%ebp),%xmm0,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx 
- movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm1,%xmm1 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm1,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl 
%ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm2,%xmm2 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm2,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd 
%xmm4,%xmm3,%xmm3 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm3,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne .L016avx_00_47 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl 
$2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - 
xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - 
movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb .L015grand_avx - movl 108(%esp),%esp - vzeroall - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 32 -.L014AVX_BMI: - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp .L017grand_avx_bmi -.align 32 -.L017grand_avx_bmi: - vmovdqu (%edi),%xmm0 - vmovdqu 16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp .L018avx_bmi_00_47 -.align 16 -.L018avx_bmi_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 28(%esp),%edx - andl %eax,%ebx - addl 32(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 24(%esp),%edx - andl %ebx,%eax - addl 36(%esp),%edx - vpaddd %xmm4,%xmm0,%xmm0 - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - 
rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm0,%xmm0 - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm0,%xmm7 - addl 20(%esp),%edx - andl %eax,%ebx - addl 40(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - addl 16(%esp),%edx - andl %ebx,%eax - addl 44(%esp),%edx - vpaddd (%ebp),%xmm0,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 12(%esp),%edx - andl %eax,%ebx - addl 48(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 8(%esp),%edx - andl %ebx,%eax - addl 52(%esp),%edx - vpaddd %xmm4,%xmm1,%xmm1 - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm1,%xmm1 - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm1,%xmm7 - addl 4(%esp),%edx - andl %eax,%ebx - addl 56(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - vpsrlq 
$17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - addl (%esp),%edx - andl %ebx,%eax - addl 60(%esp),%edx - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - vpalignr $4,%xmm0,%xmm1,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 28(%esp),%edx - andl %eax,%ebx - addl 64(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 24(%esp),%edx - andl %ebx,%eax - addl 68(%esp),%edx - vpaddd %xmm4,%xmm2,%xmm2 - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm2,%xmm2 - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm2,%xmm7 - addl 20(%esp),%edx - andl %eax,%ebx - addl 72(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - addl 16(%esp),%edx - andl %ebx,%eax - addl 76(%esp),%edx - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %edi,%eax 
- addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 12(%esp),%edx - andl %eax,%ebx - addl 80(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 8(%esp),%edx - andl %ebx,%eax - addl 84(%esp),%edx - vpaddd %xmm4,%xmm3,%xmm3 - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm3,%xmm3 - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm3,%xmm7 - addl 4(%esp),%edx - andl %eax,%ebx - addl 88(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - addl (%esp),%edx - andl %ebx,%eax - addl 92(%esp),%edx - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne .L018avx_bmi_00_47 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - andl %eax,%ebx - addl 32(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl 
$11,%edx,%esi - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - andl %ebx,%eax - addl 36(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - andl %eax,%ebx - addl 40(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - andl %ebx,%eax - addl 44(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - andl %eax,%ebx - addl 48(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - andl %ebx,%eax - addl 52(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - andl %eax,%ebx - addl 56(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl (%esp),%edx - andl %ebx,%eax - addl 60(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl 
%edx,16(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - andl %eax,%ebx - addl 64(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - andl %ebx,%eax - addl 68(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - andl %eax,%ebx - addl 72(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - andl %ebx,%eax - addl 76(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - andl %eax,%ebx - addl 80(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - andl %ebx,%eax - addl 84(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - andl %eax,%ebx - addl 88(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,20(%esp) - rorxl 
$25,%edx,%edi - xorl %esi,%ecx - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl (%esp),%edx - andl %ebx,%eax - addl 92(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb .L017grand_avx_bmi - movl 108(%esp),%esp - vzeroall - popl %edi - popl %esi - popl %ebx - popl %ebp - ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 diff --git a/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s b/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s index d75e61693d5de1..a0fe22eb2eca7b 100644 --- a/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s +++ b/deps/openssl/asm/x86-macosx-gas/sha/sha1-586.s @@ -22,11 +22,6 @@ L000pic_point: jz L001x86 testl $536870912,%ecx jnz Lshaext_shortcut - andl $268435456,%edx - andl $1073741824,%eax - orl %edx,%eax - cmpl $1342177280,%eax - je Lavx_shortcut jmp Lssse3_shortcut .align 4,0x90 L001x86: @@ -2779,1174 +2774,6 @@ L007done: popl %ebx popl %ebp ret -.align 4 -__sha1_block_data_order_avx: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call L008pic_point -L008pic_point: - popl %ebp - leal LK_XX_XX-L008pic_point(%ebp),%ebp -Lavx_shortcut: - vzeroall - vmovdqa (%ebp),%xmm7 - vmovdqa 16(%ebp),%xmm0 - vmovdqa 32(%ebp),%xmm1 - vmovdqa 48(%ebp),%xmm2 - vmovdqa 64(%ebp),%xmm6 - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%edx - movl %esp,%esi - subl $208,%esp - andl $-64,%esp - vmovdqa %xmm0,112(%esp) - vmovdqa %xmm1,128(%esp) - vmovdqa %xmm2,144(%esp) - shll $6,%edx - vmovdqa %xmm7,160(%esp) - addl %ebp,%edx - vmovdqa %xmm6,176(%esp) - addl $64,%ebp - movl %edi,192(%esp) - movl %ebp,196(%esp) - movl %edx,200(%esp) - movl %esi,204(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - movl 16(%edi),%edi - movl %ebx,%esi - vmovdqu -64(%ebp),%xmm0 - vmovdqu -48(%ebp),%xmm1 - vmovdqu -32(%ebp),%xmm2 - vmovdqu -16(%ebp),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vmovdqa %xmm7,96(%esp) - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm7,%xmm0,%xmm4 - vpaddd %xmm7,%xmm1,%xmm5 - vpaddd %xmm7,%xmm2,%xmm6 - vmovdqa %xmm4,(%esp) - movl %ecx,%ebp - vmovdqa %xmm5,16(%esp) - xorl %edx,%ebp - vmovdqa %xmm6,32(%esp) - andl %ebp,%esi - jmp L009loop -.align 4,0x90 -L009loop: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%ebp - addl (%esp),%edi - vpaddd %xmm3,%xmm7,%xmm7 - vmovdqa %xmm0,64(%esp) - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%edi - vpxor %xmm2,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl 
%ecx,%ebp - vmovdqa %xmm7,48(%esp) - movl %edi,%esi - addl 4(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%edi,%edi - addl %ebp,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm6 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm0 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%ebp - addl 8(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrld $30,%xmm0,%xmm7 - vpor %xmm6,%xmm4,%xmm4 - addl %esi,%ecx - andl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - vpslld $2,%xmm0,%xmm0 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vpxor %xmm7,%xmm4,%xmm4 - movl %ecx,%esi - addl 12(%esp),%ebx - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpxor %xmm0,%xmm4,%xmm4 - addl %ebp,%ebx - andl %edx,%esi - vmovdqa 96(%esp),%xmm0 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%ebp - addl 16(%esp),%eax - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqa %xmm1,80(%esp) - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vmovdqa %xmm0,(%esp) - movl %eax,%esi - addl 20(%esp),%edi - vpxor %xmm7,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %ebp,%edi - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm7 - xorl %ecx,%ebx - addl %eax,%edi - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm1 - vpaddd %xmm5,%xmm5,%xmm5 - movl %edi,%ebp - addl 24(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm0 - vpor %xmm7,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpxor %xmm0,%xmm5,%xmm5 - movl %edx,%esi - addl 28(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpxor %xmm1,%xmm5,%xmm5 - addl %ebp,%ecx - andl %edi,%esi - vmovdqa 112(%esp),%xmm1 - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%ebp - addl 32(%esp),%ebx - vpaddd %xmm5,%xmm1,%xmm1 - vmovdqa %xmm2,96(%esp) - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm0 - addl %esi,%ebx - andl %edx,%ebp - vpxor %xmm2,%xmm6,%xmm6 - xorl %edi,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%ecx,%ecx - xorl %edi,%ebp - vmovdqa %xmm1,16(%esp) - movl %ebx,%esi - addl 36(%esp),%eax - vpxor %xmm0,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - addl %ebp,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm0 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm2 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%ebp - addl 40(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm1 - vpor %xmm0,%xmm6,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - vmovdqa 64(%esp),%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vpxor %xmm1,%xmm6,%xmm6 - movl %edi,%esi - addl 44(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpxor %xmm2,%xmm6,%xmm6 - addl %ebp,%edx - andl %eax,%esi - vmovdqa 112(%esp),%xmm2 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%ebp - addl 48(%esp),%ecx - vpaddd %xmm6,%xmm2,%xmm2 - vmovdqa %xmm3,64(%esp) - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm1 - addl %esi,%ecx - andl %edi,%ebp - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%edi - addl %edx,%ecx - vpxor %xmm5,%xmm1,%xmm1 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vmovdqa 
%xmm2,32(%esp) - movl %ecx,%esi - addl 52(%esp),%ebx - vpxor %xmm1,%xmm7,%xmm7 - xorl %edi,%edx - shldl $5,%ecx,%ecx - addl %ebp,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm1 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpslldq $12,%xmm7,%xmm3 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%ebp - addl 56(%esp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm2 - vpor %xmm1,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - vmovdqa 80(%esp),%xmm1 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vpxor %xmm2,%xmm7,%xmm7 - movl %eax,%esi - addl 60(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpxor %xmm3,%xmm7,%xmm7 - addl %ebp,%edi - andl %ebx,%esi - vmovdqa 112(%esp),%xmm3 - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %edi,%ebp - addl (%esp),%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,80(%esp) - xorl %ebx,%eax - shldl $5,%edi,%edi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - addl %esi,%edx - andl %eax,%ebp - vpxor %xmm2,%xmm0,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - movl %edx,%esi - addl 4(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %ebp,%ecx - andl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%ebp - addl 8(%esp),%ebx - vpor %xmm2,%xmm0,%xmm0 - xorl %edi,%edx - shldl $5,%ecx,%ecx - vmovdqa 96(%esp),%xmm2 - addl %esi,%ebx - andl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 12(%esp),%eax - xorl %edi,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,96(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm3,%xmm1,%xmm1 - addl 20(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm3,%xmm1,%xmm1 - addl 28(%esp),%ebx - xorl %edi,%ebp - vmovdqa 64(%esp),%xmm3 - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,64(%esp) - addl %esi,%eax - xorl %edx,%ebp - vmovdqa 128(%esp),%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm4,%xmm2,%xmm2 - addl 36(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - addl 40(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vpor %xmm4,%xmm2,%xmm2 - addl 44(%esp),%ecx - xorl %eax,%ebp - vmovdqa 80(%esp),%xmm4 - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - 
xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,80(%esp) - addl %esi,%ebx - xorl %edi,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%edx - xorl %ebx,%ebp - vmovdqa 96(%esp),%xmm5 - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpalignr $8,%xmm2,%xmm3,%xmm6 - vpxor %xmm0,%xmm4,%xmm4 - addl (%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - vmovdqa %xmm0,96(%esp) - addl %esi,%ecx - xorl %eax,%ebp - vmovdqa %xmm7,%xmm0 - vpaddd %xmm3,%xmm7,%xmm7 - shrdl $7,%edi,%edi - addl %edx,%ecx - vpxor %xmm6,%xmm4,%xmm4 - addl 4(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm6 - vmovdqa %xmm7,48(%esp) - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm6,%xmm4,%xmm4 - addl 12(%esp),%edi - xorl %ecx,%ebp - vmovdqa 64(%esp),%xmm6 - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpalignr $8,%xmm3,%xmm4,%xmm7 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - vpxor %xmm6,%xmm5,%xmm5 - vmovdqa %xmm1,64(%esp) - addl %esi,%edx - xorl %ebx,%ebp - vmovdqa %xmm0,%xmm1 - vpaddd %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - addl %edi,%edx - vpxor %xmm7,%xmm5,%xmm5 - addl 20(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm7 - vmovdqa %xmm0,(%esp) - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm7,%xmm5,%xmm5 - addl 28(%esp),%eax - vmovdqa 80(%esp),%xmm7 - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%esp),%edi - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - vmovdqa %xmm2,80(%esp) - movl %eax,%ebp - xorl %ecx,%esi - vmovdqa %xmm1,%xmm2 - vpaddd %xmm5,%xmm1,%xmm1 - shldl $5,%eax,%eax - addl %esi,%edi - vpxor %xmm0,%xmm6,%xmm6 - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 36(%esp),%edx - vpsrld $30,%xmm6,%xmm0 - vmovdqa %xmm1,16(%esp) - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - addl 40(%esp),%ecx - andl %eax,%esi - vpor %xmm0,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%edi,%edi - vmovdqa 96(%esp),%xmm0 - movl 
%edx,%ebp - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 44(%esp),%ebx - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm1 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%esp),%eax - andl %edx,%esi - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa %xmm3,96(%esp) - movl %ebx,%ebp - xorl %edx,%esi - vmovdqa 144(%esp),%xmm3 - vpaddd %xmm6,%xmm2,%xmm2 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%esp),%edi - vpsrld $30,%xmm7,%xmm1 - vmovdqa %xmm2,32(%esp) - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - addl 56(%esp),%edx - andl %ebx,%esi - vpor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vmovdqa 64(%esp),%xmm1 - movl %edi,%ebp - xorl %ebx,%esi - shldl $5,%edi,%edi - addl %esi,%edx - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 60(%esp),%ecx - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - addl (%esp),%ebx - andl %edi,%esi - xorl %eax,%edi - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,64(%esp) - movl %ecx,%ebp - xorl %edi,%esi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm2,%xmm0,%xmm0 - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 4(%esp),%eax - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%esp),%edi - andl %ecx,%esi - vpor %xmm2,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vmovdqa 80(%esp),%xmm2 - movl %eax,%ebp - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 12(%esp),%edx - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%edi,%edi - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,80(%esp) - movl %edx,%ebp - xorl %eax,%esi - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm3,%xmm1,%xmm1 - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 20(%esp),%ebx - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - addl 24(%esp),%eax - andl %edx,%esi - vpor %xmm3,%xmm1,%xmm1 - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vmovdqa 96(%esp),%xmm3 - movl %ebx,%ebp - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%esp),%edi - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%ebp - shldl 
$5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,96(%esp) - movl %edi,%ebp - xorl %ebx,%esi - vmovdqa %xmm5,%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shldl $5,%edi,%edi - addl %esi,%edx - vpxor %xmm4,%xmm2,%xmm2 - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 36(%esp),%ecx - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - addl 40(%esp),%ebx - andl %edi,%esi - vpor %xmm4,%xmm2,%xmm2 - xorl %eax,%edi - shrdl $7,%edx,%edx - vmovdqa 64(%esp),%xmm4 - movl %ecx,%ebp - xorl %edi,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 44(%esp),%eax - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,64(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl (%esp),%eax - vpaddd %xmm3,%xmm7,%xmm7 - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm7,48(%esp) - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 8(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 12(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - movl 196(%esp),%ebp - cmpl 200(%esp),%ebp - je L010done - vmovdqa 160(%esp),%xmm7 - vmovdqa 176(%esp),%xmm6 - vmovdqu (%ebp),%xmm0 - vmovdqu 16(%ebp),%xmm1 - vmovdqu 32(%ebp),%xmm2 - vmovdqu 48(%ebp),%xmm3 - addl $64,%ebp - vpshufb %xmm6,%xmm0,%xmm0 - movl %ebp,196(%esp) - vmovdqa %xmm7,96(%esp) - addl 16(%esp),%ebx - xorl %edi,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpaddd %xmm7,%xmm0,%xmm4 - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,(%esp) - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl 
%edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%ebp - shldl $5,%edx,%edx - vpaddd %xmm7,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vmovdqa %xmm5,16(%esp) - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %edi,%ebp - shldl $5,%edi,%edi - vpaddd %xmm7,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vmovdqa %xmm6,32(%esp) - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,%ebx - movl %ecx,8(%ebp) - xorl %edx,%ebx - movl %edx,12(%ebp) - movl %edi,16(%ebp) - movl %esi,%ebp - andl %ebx,%esi - movl %ebp,%ebx - jmp L009loop -.align 4,0x90 -L010done: - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroall - 
movl 192(%esp),%ebp - addl (%ebp),%eax - movl 204(%esp),%esp - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - movl %edi,16(%ebp) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret .align 6,0x90 LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 diff --git a/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s b/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s index d30c582726c2be..37f532aa5fb686 100644 --- a/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s +++ b/deps/openssl/asm/x86-macosx-gas/sha/sha256-586.s @@ -39,13 +39,12 @@ L000pic_point: orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx - je L005AVX testl $512,%ebx - jnz L006SSSE3 + jnz L005SSSE3 L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae L007unrolled + jae L006unrolled jmp L002loop .align 4,0x90 L002loop: @@ -117,7 +116,7 @@ L002loop: movl %ecx,28(%esp) movl %edi,32(%esp) .align 4,0x90 -L00800_15: +L00700_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -155,11 +154,11 @@ L00800_15: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne L00800_15 + jne L00700_15 movl 156(%esp),%ecx - jmp L00916_63 + jmp L00816_63 .align 4,0x90 -L00916_63: +L00816_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -214,7 +213,7 @@ L00916_63: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne L00916_63 + jne L00816_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -258,7 +257,7 @@ L001K256: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 4,0x90 -L007unrolled: +L006unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -275,9 +274,9 @@ L007unrolled: movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp L010grand_loop + jmp L009grand_loop .align 4,0x90 -L010grand_loop: +L009grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx @@ -3157,7 +3156,7 @@ L010grand_loop: movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb L010grand_loop + jb L009grand_loop movl 108(%esp),%esp popl %edi popl %esi @@ -3176,9 +3175,9 @@ L004shaext: pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp L011loop_shaext + jmp L010loop_shaext .align 4,0x90 -L011loop_shaext: +L010loop_shaext: movdqu (%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 @@ -3348,7 +3347,7 @@ L011loop_shaext: .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz L011loop_shaext + jnz L010loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 @@ -3363,7 +3362,7 @@ L011loop_shaext: popl %ebp ret .align 5,0x90 -L006SSSE3: +L005SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx @@ -3382,9 +3381,9 @@ L006SSSE3: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp L012grand_ssse3 + jmp L011grand_ssse3 .align 4,0x90 -L012grand_ssse3: +L011grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3407,9 +3406,9 @@ L012grand_ssse3: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp L013ssse3_00_47 + jmp L012ssse3_00_47 .align 4,0x90 -L013ssse3_00_47: +L012ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4052,7 +4051,7 @@ L013ssse3_00_47: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne L013ssse3_00_47 + jne L012ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4566,2218 +4565,13 @@ L013ssse3_00_47: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb L012grand_ssse3 + jb L011grand_ssse3 movl 
108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret -.align 5,0x90 -L005AVX: - andl $264,%edx - cmpl $264,%edx - je L014AVX_BMI - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp L015grand_avx -.align 5,0x90 -L015grand_avx: - vmovdqu (%edi),%xmm0 - vmovdqu 16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp L016avx_00_47 -.align 4,0x90 -L016avx_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm0,%xmm0 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm0,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 
4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd (%ebp),%xmm0,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm1,%xmm1 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm1,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl 
%ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm2,%xmm2 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm2,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld 
$7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm3,%xmm3 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm3,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne L016avx_00_47 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl 
%edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) 
- xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) 
- xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb L015grand_avx - movl 108(%esp),%esp - vzeroall - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 5,0x90 -L014AVX_BMI: - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp L017grand_avx_bmi -.align 5,0x90 -L017grand_avx_bmi: - vmovdqu (%edi),%xmm0 - vmovdqu 16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp L018avx_bmi_00_47 -.align 4,0x90 -L018avx_bmi_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal 
(%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 28(%esp),%edx - andl %eax,%ebx - addl 32(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 24(%esp),%edx - andl %ebx,%eax - addl 36(%esp),%edx - vpaddd %xmm4,%xmm0,%xmm0 - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm0,%xmm0 - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm0,%xmm7 - addl 20(%esp),%edx - andl %eax,%ebx - addl 40(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - addl 16(%esp),%edx - andl %ebx,%eax - addl 44(%esp),%edx - vpaddd (%ebp),%xmm0,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 12(%esp),%edx - andl %eax,%ebx - addl 48(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl 
$13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 8(%esp),%edx - andl %ebx,%eax - addl 52(%esp),%edx - vpaddd %xmm4,%xmm1,%xmm1 - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm1,%xmm1 - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm1,%xmm7 - addl 4(%esp),%edx - andl %eax,%ebx - addl 56(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - addl (%esp),%edx - andl %ebx,%eax - addl 60(%esp),%edx - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - vpalignr $4,%xmm0,%xmm1,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 28(%esp),%edx - andl %eax,%ebx - addl 64(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 24(%esp),%edx - andl %ebx,%eax - addl 68(%esp),%edx - vpaddd %xmm4,%xmm2,%xmm2 - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,24(%esp) - 
orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm2,%xmm2 - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm2,%xmm7 - addl 20(%esp),%edx - andl %eax,%ebx - addl 72(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - addl 16(%esp),%edx - andl %ebx,%eax - addl 76(%esp),%edx - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm7 - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - vpsrld $7,%xmm4,%xmm6 - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrld $3,%xmm4,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpslld $14,%xmm4,%xmm5 - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpxor %xmm6,%xmm7,%xmm4 - addl 12(%esp),%edx - andl %eax,%ebx - addl 80(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - vpsrld $11,%xmm6,%xmm6 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpslld $11,%xmm5,%xmm5 - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpsrld $10,%xmm7,%xmm6 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpxor %xmm5,%xmm4,%xmm4 - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpsrlq $17,%xmm7,%xmm5 - addl 8(%esp),%edx - andl %ebx,%eax - addl 84(%esp),%edx - vpaddd %xmm4,%xmm3,%xmm3 - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - vpxor %xmm5,%xmm6,%xmm6 - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpsrlq $19,%xmm7,%xmm7 - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - vpshufd $132,%xmm6,%xmm7 - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - vpsrldq $8,%xmm7,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - vpaddd %xmm7,%xmm3,%xmm3 - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - vpshufd $80,%xmm3,%xmm7 - addl 4(%esp),%edx - andl %eax,%ebx - addl 88(%esp),%edx - vpsrld $10,%xmm7,%xmm6 - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - vpsrlq $17,%xmm7,%xmm5 - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - vpxor %xmm5,%xmm6,%xmm6 - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - vpsrlq $19,%xmm7,%xmm7 - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - vpxor 
%xmm7,%xmm6,%xmm6 - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - vpshufd $232,%xmm6,%xmm7 - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - vpslldq $8,%xmm7,%xmm7 - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - addl (%esp),%edx - andl %ebx,%eax - addl 92(%esp),%edx - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne L018avx_bmi_00_47 - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - andl %eax,%ebx - addl 32(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - andl %ebx,%eax - addl 36(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - andl %eax,%ebx - addl 40(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - andl %ebx,%eax - addl 44(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - andl %eax,%ebx - addl 48(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - andl %ebx,%eax - addl 52(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - leal 
(%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - andl %eax,%ebx - addl 56(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl (%esp),%edx - andl %ebx,%eax - addl 60(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,16(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 24(%esp),%edx,%esi - xorl %edi,%ecx - andl 20(%esp),%edx - movl %eax,(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 4(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - andl %eax,%ebx - addl 64(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 12(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,12(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 20(%esp),%edx,%esi - xorl %edi,%ecx - andl 16(%esp),%edx - movl %ebx,28(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl (%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - andl %ebx,%eax - addl 68(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 8(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,8(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 16(%esp),%edx,%esi - xorl %edi,%ecx - andl 12(%esp),%edx - movl %eax,24(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 28(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - andl %eax,%ebx - addl 72(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 4(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,4(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 12(%esp),%edx,%esi - xorl %edi,%ecx - andl 8(%esp),%edx - movl %ebx,20(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 24(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - andl %ebx,%eax - addl 76(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl (%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 8(%esp),%edx,%esi - xorl %edi,%ecx - andl 4(%esp),%edx - movl %eax,16(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 20(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - andl %eax,%ebx - addl 80(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 28(%esp),%edx - leal (%ebx,%ecx,1),%ebx - 
rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,28(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 4(%esp),%edx,%esi - xorl %edi,%ecx - andl (%esp),%edx - movl %ebx,12(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 16(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - andl %ebx,%eax - addl 84(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 24(%esp),%edx - leal (%eax,%ecx,1),%eax - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,24(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl (%esp),%edx,%esi - xorl %edi,%ecx - andl 28(%esp),%edx - movl %eax,8(%esp) - orl %esi,%edx - rorxl $2,%eax,%edi - rorxl $13,%eax,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%eax,%ecx - xorl %edi,%esi - movl 12(%esp),%edi - xorl %esi,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - andl %eax,%ebx - addl 88(%esp),%edx - xorl %edi,%ebx - addl %edx,%ecx - addl 20(%esp),%edx - leal (%ebx,%ecx,1),%ebx - rorxl $6,%edx,%ecx - rorxl $11,%edx,%esi - movl %edx,20(%esp) - rorxl $25,%edx,%edi - xorl %esi,%ecx - andnl 28(%esp),%edx,%esi - xorl %edi,%ecx - andl 24(%esp),%edx - movl %ebx,4(%esp) - orl %esi,%edx - rorxl $2,%ebx,%edi - rorxl $13,%ebx,%esi - leal (%edx,%ecx,1),%edx - rorxl $22,%ebx,%ecx - xorl %edi,%esi - movl 8(%esp),%edi - xorl %esi,%ecx - xorl %edi,%ebx - addl (%esp),%edx - andl %ebx,%eax - addl 92(%esp),%edx - xorl %edi,%eax - addl %edx,%ecx - addl 16(%esp),%edx - leal (%eax,%ecx,1),%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb L017grand_avx_bmi - movl 108(%esp),%esp - vzeroall - popl %edi - popl %esi - popl %ebx - popl %ebp - ret .section __IMPORT,__pointers,non_lazy_symbol_pointers L_OPENSSL_ia32cap_P$non_lazy_ptr: .indirect_symbol _OPENSSL_ia32cap_P diff --git a/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm b/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm index 4607eda762a75a..38aaf17445b4b8 100644 --- a/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm +++ b/deps/openssl/asm/x86-win32-masm/sha/sha1-586.asm @@ -39,11 +39,6 @@ $L000pic_point: jz $L001x86 test ecx,536870912 jnz $Lshaext_shortcut - and edx,268435456 - and eax,1073741824 - or eax,edx - cmp eax,1342177280 - je $Lavx_shortcut jmp $Lssse3_shortcut ALIGN 16 $L001x86: @@ -2799,1175 +2794,6 @@ $L007done: pop ebp ret __sha1_block_data_order_ssse3 ENDP -ALIGN 16 -__sha1_block_data_order_avx PROC PRIVATE - push ebp - push ebx - push esi - push edi - call $L008pic_point -$L008pic_point: - pop ebp - lea ebp,DWORD PTR ($LK_XX_XX-$L008pic_point)[ebp] -$Lavx_shortcut:: - vzeroall - vmovdqa xmm7,XMMWORD PTR [ebp] - vmovdqa xmm0,XMMWORD PTR 16[ebp] - vmovdqa xmm1,XMMWORD PTR 32[ebp] - vmovdqa xmm2,XMMWORD PTR 48[ebp] - vmovdqa xmm6,XMMWORD PTR 64[ebp] - mov edi,DWORD PTR 20[esp] - mov ebp,DWORD PTR 24[esp] - mov edx,DWORD PTR 28[esp] - mov esi,esp - sub esp,208 - and esp,-64 - vmovdqa XMMWORD PTR 112[esp],xmm0 
- vmovdqa XMMWORD PTR 128[esp],xmm1 - vmovdqa XMMWORD PTR 144[esp],xmm2 - shl edx,6 - vmovdqa XMMWORD PTR 160[esp],xmm7 - add edx,ebp - vmovdqa XMMWORD PTR 176[esp],xmm6 - add ebp,64 - mov DWORD PTR 192[esp],edi - mov DWORD PTR 196[esp],ebp - mov DWORD PTR 200[esp],edx - mov DWORD PTR 204[esp],esi - mov eax,DWORD PTR [edi] - mov ebx,DWORD PTR 4[edi] - mov ecx,DWORD PTR 8[edi] - mov edx,DWORD PTR 12[edi] - mov edi,DWORD PTR 16[edi] - mov esi,ebx - vmovdqu xmm0,XMMWORD PTR [ebp-64] - vmovdqu xmm1,XMMWORD PTR [ebp-48] - vmovdqu xmm2,XMMWORD PTR [ebp-32] - vmovdqu xmm3,XMMWORD PTR [ebp-16] - vpshufb xmm0,xmm0,xmm6 - vpshufb xmm1,xmm1,xmm6 - vpshufb xmm2,xmm2,xmm6 - vmovdqa XMMWORD PTR 96[esp],xmm7 - vpshufb xmm3,xmm3,xmm6 - vpaddd xmm4,xmm0,xmm7 - vpaddd xmm5,xmm1,xmm7 - vpaddd xmm6,xmm2,xmm7 - vmovdqa XMMWORD PTR [esp],xmm4 - mov ebp,ecx - vmovdqa XMMWORD PTR 16[esp],xmm5 - xor ebp,edx - vmovdqa XMMWORD PTR 32[esp],xmm6 - and esi,ebp - jmp $L009loop -ALIGN 16 -$L009loop: - shrd ebx,ebx,2 - xor esi,edx - vpalignr xmm4,xmm1,xmm0,8 - mov ebp,eax - add edi,DWORD PTR [esp] - vpaddd xmm7,xmm7,xmm3 - vmovdqa XMMWORD PTR 64[esp],xmm0 - xor ebx,ecx - shld eax,eax,5 - vpsrldq xmm6,xmm3,4 - add edi,esi - and ebp,ebx - vpxor xmm4,xmm4,xmm0 - xor ebx,ecx - add edi,eax - vpxor xmm6,xmm6,xmm2 - shrd eax,eax,7 - xor ebp,ecx - vmovdqa XMMWORD PTR 48[esp],xmm7 - mov esi,edi - add edx,DWORD PTR 4[esp] - vpxor xmm4,xmm4,xmm6 - xor eax,ebx - shld edi,edi,5 - add edx,ebp - and esi,eax - vpsrld xmm6,xmm4,31 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpslldq xmm0,xmm4,12 - vpaddd xmm4,xmm4,xmm4 - mov ebp,edx - add ecx,DWORD PTR 8[esp] - xor edi,eax - shld edx,edx,5 - vpsrld xmm7,xmm0,30 - vpor xmm4,xmm4,xmm6 - add ecx,esi - and ebp,edi - xor edi,eax - add ecx,edx - vpslld xmm0,xmm0,2 - shrd edx,edx,7 - xor ebp,eax - vpxor xmm4,xmm4,xmm7 - mov esi,ecx - add ebx,DWORD PTR 12[esp] - xor edx,edi - shld ecx,ecx,5 - vpxor xmm4,xmm4,xmm0 - add ebx,ebp - and esi,edx - vmovdqa xmm0,XMMWORD PTR 96[esp] - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpalignr xmm5,xmm2,xmm1,8 - mov ebp,ebx - add eax,DWORD PTR 16[esp] - vpaddd xmm0,xmm0,xmm4 - vmovdqa XMMWORD PTR 80[esp],xmm1 - xor ecx,edx - shld ebx,ebx,5 - vpsrldq xmm7,xmm4,4 - add eax,esi - and ebp,ecx - vpxor xmm5,xmm5,xmm1 - xor ecx,edx - add eax,ebx - vpxor xmm7,xmm7,xmm3 - shrd ebx,ebx,7 - xor ebp,edx - vmovdqa XMMWORD PTR [esp],xmm0 - mov esi,eax - add edi,DWORD PTR 20[esp] - vpxor xmm5,xmm5,xmm7 - xor ebx,ecx - shld eax,eax,5 - add edi,ebp - and esi,ebx - vpsrld xmm7,xmm5,31 - xor ebx,ecx - add edi,eax - shrd eax,eax,7 - xor esi,ecx - vpslldq xmm1,xmm5,12 - vpaddd xmm5,xmm5,xmm5 - mov ebp,edi - add edx,DWORD PTR 24[esp] - xor eax,ebx - shld edi,edi,5 - vpsrld xmm0,xmm1,30 - vpor xmm5,xmm5,xmm7 - add edx,esi - and ebp,eax - xor eax,ebx - add edx,edi - vpslld xmm1,xmm1,2 - shrd edi,edi,7 - xor ebp,ebx - vpxor xmm5,xmm5,xmm0 - mov esi,edx - add ecx,DWORD PTR 28[esp] - xor edi,eax - shld edx,edx,5 - vpxor xmm5,xmm5,xmm1 - add ecx,ebp - and esi,edi - vmovdqa xmm1,XMMWORD PTR 112[esp] - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - vpalignr xmm6,xmm3,xmm2,8 - mov ebp,ecx - add ebx,DWORD PTR 32[esp] - vpaddd xmm1,xmm1,xmm5 - vmovdqa XMMWORD PTR 96[esp],xmm2 - xor edx,edi - shld ecx,ecx,5 - vpsrldq xmm0,xmm5,4 - add ebx,esi - and ebp,edx - vpxor xmm6,xmm6,xmm2 - xor edx,edi - add ebx,ecx - vpxor xmm0,xmm0,xmm4 - shrd ecx,ecx,7 - xor ebp,edi - vmovdqa XMMWORD PTR 16[esp],xmm1 - mov esi,ebx - add eax,DWORD PTR 36[esp] - vpxor 
xmm6,xmm6,xmm0 - xor ecx,edx - shld ebx,ebx,5 - add eax,ebp - and esi,ecx - vpsrld xmm0,xmm6,31 - xor ecx,edx - add eax,ebx - shrd ebx,ebx,7 - xor esi,edx - vpslldq xmm2,xmm6,12 - vpaddd xmm6,xmm6,xmm6 - mov ebp,eax - add edi,DWORD PTR 40[esp] - xor ebx,ecx - shld eax,eax,5 - vpsrld xmm1,xmm2,30 - vpor xmm6,xmm6,xmm0 - add edi,esi - and ebp,ebx - xor ebx,ecx - add edi,eax - vpslld xmm2,xmm2,2 - vmovdqa xmm0,XMMWORD PTR 64[esp] - shrd eax,eax,7 - xor ebp,ecx - vpxor xmm6,xmm6,xmm1 - mov esi,edi - add edx,DWORD PTR 44[esp] - xor eax,ebx - shld edi,edi,5 - vpxor xmm6,xmm6,xmm2 - add edx,ebp - and esi,eax - vmovdqa xmm2,XMMWORD PTR 112[esp] - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor esi,ebx - vpalignr xmm7,xmm4,xmm3,8 - mov ebp,edx - add ecx,DWORD PTR 48[esp] - vpaddd xmm2,xmm2,xmm6 - vmovdqa XMMWORD PTR 64[esp],xmm3 - xor edi,eax - shld edx,edx,5 - vpsrldq xmm1,xmm6,4 - add ecx,esi - and ebp,edi - vpxor xmm7,xmm7,xmm3 - xor edi,eax - add ecx,edx - vpxor xmm1,xmm1,xmm5 - shrd edx,edx,7 - xor ebp,eax - vmovdqa XMMWORD PTR 32[esp],xmm2 - mov esi,ecx - add ebx,DWORD PTR 52[esp] - vpxor xmm7,xmm7,xmm1 - xor edx,edi - shld ecx,ecx,5 - add ebx,ebp - and esi,edx - vpsrld xmm1,xmm7,31 - xor edx,edi - add ebx,ecx - shrd ecx,ecx,7 - xor esi,edi - vpslldq xmm3,xmm7,12 - vpaddd xmm7,xmm7,xmm7 - mov ebp,ebx - add eax,DWORD PTR 56[esp] - xor ecx,edx - shld ebx,ebx,5 - vpsrld xmm2,xmm3,30 - vpor xmm7,xmm7,xmm1 - add eax,esi - and ebp,ecx - xor ecx,edx - add eax,ebx - vpslld xmm3,xmm3,2 - vmovdqa xmm1,XMMWORD PTR 80[esp] - shrd ebx,ebx,7 - xor ebp,edx - vpxor xmm7,xmm7,xmm2 - mov esi,eax - add edi,DWORD PTR 60[esp] - xor ebx,ecx - shld eax,eax,5 - vpxor xmm7,xmm7,xmm3 - add edi,ebp - and esi,ebx - vmovdqa xmm3,XMMWORD PTR 112[esp] - xor ebx,ecx - add edi,eax - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - shrd eax,eax,7 - xor esi,ecx - mov ebp,edi - add edx,DWORD PTR [esp] - vpxor xmm0,xmm0,xmm1 - vmovdqa XMMWORD PTR 80[esp],xmm4 - xor eax,ebx - shld edi,edi,5 - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - add edx,esi - and ebp,eax - vpxor xmm0,xmm0,xmm2 - xor eax,ebx - add edx,edi - shrd edi,edi,7 - xor ebp,ebx - vpsrld xmm2,xmm0,30 - vmovdqa XMMWORD PTR 48[esp],xmm3 - mov esi,edx - add ecx,DWORD PTR 4[esp] - xor edi,eax - shld edx,edx,5 - vpslld xmm0,xmm0,2 - add ecx,ebp - and esi,edi - xor edi,eax - add ecx,edx - shrd edx,edx,7 - xor esi,eax - mov ebp,ecx - add ebx,DWORD PTR 8[esp] - vpor xmm0,xmm0,xmm2 - xor edx,edi - shld ecx,ecx,5 - vmovdqa xmm2,XMMWORD PTR 96[esp] - add ebx,esi - and ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD PTR 12[esp] - xor ebp,edi - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add edi,DWORD PTR 16[esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm1,xmm1,xmm2 - vmovdqa XMMWORD PTR 96[esp],xmm5 - add edi,esi - xor ebp,ecx - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shrd ebx,ebx,7 - add edi,eax - vpxor xmm1,xmm1,xmm3 - add edx,DWORD PTR 20[esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm3,xmm1,30 - vmovdqa XMMWORD PTR [esp],xmm4 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm1,xmm1,2 - add ecx,DWORD PTR 24[esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm1,xmm1,xmm3 - add ebx,DWORD PTR 28[esp] - xor ebp,edi - vmovdqa xmm3,XMMWORD PTR 64[esp] - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpalignr 
xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add eax,DWORD PTR 32[esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - vpxor xmm2,xmm2,xmm3 - vmovdqa XMMWORD PTR 64[esp],xmm6 - add eax,esi - xor ebp,edx - vmovdqa xmm6,XMMWORD PTR 128[esp] - vpaddd xmm5,xmm5,xmm1 - shrd ecx,ecx,7 - add eax,ebx - vpxor xmm2,xmm2,xmm4 - add edi,DWORD PTR 36[esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - vpsrld xmm4,xmm2,30 - vmovdqa XMMWORD PTR 16[esp],xmm5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpslld xmm2,xmm2,2 - add edx,DWORD PTR 40[esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vpor xmm2,xmm2,xmm4 - add ecx,DWORD PTR 44[esp] - xor ebp,eax - vmovdqa xmm4,XMMWORD PTR 80[esp] - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add ebx,DWORD PTR 48[esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa XMMWORD PTR 80[esp],xmm7 - add ebx,esi - xor ebp,edi - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd edx,edx,7 - add ebx,ecx - vpxor xmm3,xmm3,xmm5 - add eax,DWORD PTR 52[esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - vpsrld xmm5,xmm3,30 - vmovdqa XMMWORD PTR 32[esp],xmm6 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - vpslld xmm3,xmm3,2 - add edi,DWORD PTR 56[esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - vpor xmm3,xmm3,xmm5 - add edx,DWORD PTR 60[esp] - xor ebp,ebx - vmovdqa xmm5,XMMWORD PTR 96[esp] - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpalignr xmm6,xmm3,xmm2,8 - vpxor xmm4,xmm4,xmm0 - add ecx,DWORD PTR [esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - vpxor xmm4,xmm4,xmm5 - vmovdqa XMMWORD PTR 96[esp],xmm0 - add ecx,esi - xor ebp,eax - vmovdqa xmm0,xmm7 - vpaddd xmm7,xmm7,xmm3 - shrd edi,edi,7 - add ecx,edx - vpxor xmm4,xmm4,xmm6 - add ebx,DWORD PTR 4[esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - vpsrld xmm6,xmm4,30 - vmovdqa XMMWORD PTR 48[esp],xmm7 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - vpslld xmm4,xmm4,2 - add eax,DWORD PTR 8[esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - vpor xmm4,xmm4,xmm6 - add edi,DWORD PTR 12[esp] - xor ebp,ecx - vmovdqa xmm6,XMMWORD PTR 64[esp] - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - vpalignr xmm7,xmm4,xmm3,8 - vpxor xmm5,xmm5,xmm1 - add edx,DWORD PTR 16[esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - vpxor xmm5,xmm5,xmm6 - vmovdqa XMMWORD PTR 64[esp],xmm1 - add edx,esi - xor ebp,ebx - vmovdqa xmm1,xmm0 - vpaddd xmm0,xmm0,xmm4 - shrd eax,eax,7 - add edx,edi - vpxor xmm5,xmm5,xmm7 - add ecx,DWORD PTR 20[esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - vpsrld xmm7,xmm5,30 - vmovdqa XMMWORD PTR [esp],xmm0 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - vpslld xmm5,xmm5,2 - add ebx,DWORD PTR 24[esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vpor xmm5,xmm5,xmm7 - add eax,DWORD PTR 28[esp] - vmovdqa xmm7,XMMWORD PTR 80[esp] - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - vpalignr xmm0,xmm5,xmm4,8 - vpxor xmm6,xmm6,xmm2 - add edi,DWORD PTR 32[esp] - and esi,ecx - xor ecx,edx - shrd ebx,ebx,7 - vpxor xmm6,xmm6,xmm7 - vmovdqa XMMWORD PTR 80[esp],xmm2 - 
mov ebp,eax - xor esi,ecx - vmovdqa xmm2,xmm1 - vpaddd xmm1,xmm1,xmm5 - shld eax,eax,5 - add edi,esi - vpxor xmm6,xmm6,xmm0 - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD PTR 36[esp] - vpsrld xmm0,xmm6,30 - vmovdqa XMMWORD PTR 16[esp],xmm1 - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - vpslld xmm6,xmm6,2 - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - add ecx,DWORD PTR 40[esp] - and esi,eax - vpor xmm6,xmm6,xmm0 - xor eax,ebx - shrd edi,edi,7 - vmovdqa xmm0,XMMWORD PTR 96[esp] - mov ebp,edx - xor esi,eax - shld edx,edx,5 - add ecx,esi - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD PTR 44[esp] - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - vpalignr xmm1,xmm6,xmm5,8 - vpxor xmm7,xmm7,xmm3 - add eax,DWORD PTR 48[esp] - and esi,edx - xor edx,edi - shrd ecx,ecx,7 - vpxor xmm7,xmm7,xmm0 - vmovdqa XMMWORD PTR 96[esp],xmm3 - mov ebp,ebx - xor esi,edx - vmovdqa xmm3,XMMWORD PTR 144[esp] - vpaddd xmm2,xmm2,xmm6 - shld ebx,ebx,5 - add eax,esi - vpxor xmm7,xmm7,xmm1 - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD PTR 52[esp] - vpsrld xmm1,xmm7,30 - vmovdqa XMMWORD PTR 32[esp],xmm2 - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - vpslld xmm7,xmm7,2 - xor ebp,ecx - shld eax,eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD PTR 56[esp] - and esi,ebx - vpor xmm7,xmm7,xmm1 - xor ebx,ecx - shrd eax,eax,7 - vmovdqa xmm1,XMMWORD PTR 64[esp] - mov ebp,edi - xor esi,ebx - shld edi,edi,5 - add edx,esi - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD PTR 60[esp] - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - vpalignr xmm2,xmm7,xmm6,8 - vpxor xmm0,xmm0,xmm4 - add ebx,DWORD PTR [esp] - and esi,edi - xor edi,eax - shrd edx,edx,7 - vpxor xmm0,xmm0,xmm1 - vmovdqa XMMWORD PTR 64[esp],xmm4 - mov ebp,ecx - xor esi,edi - vmovdqa xmm4,xmm3 - vpaddd xmm3,xmm3,xmm7 - shld ecx,ecx,5 - add ebx,esi - vpxor xmm0,xmm0,xmm2 - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD PTR 4[esp] - vpsrld xmm2,xmm0,30 - vmovdqa XMMWORD PTR 48[esp],xmm3 - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - vpslld xmm0,xmm0,2 - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD PTR 8[esp] - and esi,ecx - vpor xmm0,xmm0,xmm2 - xor ecx,edx - shrd ebx,ebx,7 - vmovdqa xmm2,XMMWORD PTR 80[esp] - mov ebp,eax - xor esi,ecx - shld eax,eax,5 - add edi,esi - xor ebp,ebx - xor ebx,ecx - add edi,eax - add edx,DWORD PTR 12[esp] - and ebp,ebx - xor ebx,ecx - shrd eax,eax,7 - mov esi,edi - xor ebp,ebx - shld edi,edi,5 - add edx,ebp - xor esi,eax - xor eax,ebx - add edx,edi - vpalignr xmm3,xmm0,xmm7,8 - vpxor xmm1,xmm1,xmm5 - add ecx,DWORD PTR 16[esp] - and esi,eax - xor eax,ebx - shrd edi,edi,7 - vpxor xmm1,xmm1,xmm2 - vmovdqa XMMWORD PTR 80[esp],xmm5 - mov ebp,edx - xor esi,eax - vmovdqa xmm5,xmm4 - vpaddd xmm4,xmm4,xmm0 - shld edx,edx,5 - add ecx,esi - vpxor xmm1,xmm1,xmm3 - xor ebp,edi - xor edi,eax - add ecx,edx - add ebx,DWORD PTR 20[esp] - vpsrld xmm3,xmm1,30 - vmovdqa XMMWORD PTR [esp],xmm4 - and ebp,edi - xor edi,eax - shrd edx,edx,7 - mov esi,ecx - vpslld xmm1,xmm1,2 - xor ebp,edi - shld ecx,ecx,5 - add ebx,ebp - xor esi,edx - xor edx,edi - add ebx,ecx - add eax,DWORD PTR 24[esp] - and esi,edx - vpor xmm1,xmm1,xmm3 - xor edx,edi - shrd ecx,ecx,7 - vmovdqa 
xmm3,XMMWORD PTR 96[esp] - mov ebp,ebx - xor esi,edx - shld ebx,ebx,5 - add eax,esi - xor ebp,ecx - xor ecx,edx - add eax,ebx - add edi,DWORD PTR 28[esp] - and ebp,ecx - xor ecx,edx - shrd ebx,ebx,7 - mov esi,eax - xor ebp,ecx - shld eax,eax,5 - add edi,ebp - xor esi,ebx - xor ebx,ecx - add edi,eax - vpalignr xmm4,xmm1,xmm0,8 - vpxor xmm2,xmm2,xmm6 - add edx,DWORD PTR 32[esp] - and esi,ebx - xor ebx,ecx - shrd eax,eax,7 - vpxor xmm2,xmm2,xmm3 - vmovdqa XMMWORD PTR 96[esp],xmm6 - mov ebp,edi - xor esi,ebx - vmovdqa xmm6,xmm5 - vpaddd xmm5,xmm5,xmm1 - shld edi,edi,5 - add edx,esi - vpxor xmm2,xmm2,xmm4 - xor ebp,eax - xor eax,ebx - add edx,edi - add ecx,DWORD PTR 36[esp] - vpsrld xmm4,xmm2,30 - vmovdqa XMMWORD PTR 16[esp],xmm5 - and ebp,eax - xor eax,ebx - shrd edi,edi,7 - mov esi,edx - vpslld xmm2,xmm2,2 - xor ebp,eax - shld edx,edx,5 - add ecx,ebp - xor esi,edi - xor edi,eax - add ecx,edx - add ebx,DWORD PTR 40[esp] - and esi,edi - vpor xmm2,xmm2,xmm4 - xor edi,eax - shrd edx,edx,7 - vmovdqa xmm4,XMMWORD PTR 64[esp] - mov ebp,ecx - xor esi,edi - shld ecx,ecx,5 - add ebx,esi - xor ebp,edx - xor edx,edi - add ebx,ecx - add eax,DWORD PTR 44[esp] - and ebp,edx - xor edx,edi - shrd ecx,ecx,7 - mov esi,ebx - xor ebp,edx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - add eax,ebx - vpalignr xmm5,xmm2,xmm1,8 - vpxor xmm3,xmm3,xmm7 - add edi,DWORD PTR 48[esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - vpxor xmm3,xmm3,xmm4 - vmovdqa XMMWORD PTR 64[esp],xmm7 - add edi,esi - xor ebp,ecx - vmovdqa xmm7,xmm6 - vpaddd xmm6,xmm6,xmm2 - shrd ebx,ebx,7 - add edi,eax - vpxor xmm3,xmm3,xmm5 - add edx,DWORD PTR 52[esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - vpsrld xmm5,xmm3,30 - vmovdqa XMMWORD PTR 32[esp],xmm6 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - vpslld xmm3,xmm3,2 - add ecx,DWORD PTR 56[esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vpor xmm3,xmm3,xmm5 - add ebx,DWORD PTR 60[esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD PTR [esp] - vpaddd xmm7,xmm7,xmm3 - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - vmovdqa XMMWORD PTR 48[esp],xmm7 - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD PTR 4[esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD PTR 8[esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD PTR 12[esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - mov ebp,DWORD PTR 196[esp] - cmp ebp,DWORD PTR 200[esp] - je $L010done - vmovdqa xmm7,XMMWORD PTR 160[esp] - vmovdqa xmm6,XMMWORD PTR 176[esp] - vmovdqu xmm0,XMMWORD PTR [ebp] - vmovdqu xmm1,XMMWORD PTR 16[ebp] - vmovdqu xmm2,XMMWORD PTR 32[ebp] - vmovdqu xmm3,XMMWORD PTR 48[ebp] - add ebp,64 - vpshufb xmm0,xmm0,xmm6 - mov DWORD PTR 196[esp],ebp - vmovdqa XMMWORD PTR 96[esp],xmm7 - add ebx,DWORD PTR 16[esp] - xor esi,edi - vpshufb xmm1,xmm1,xmm6 - mov ebp,ecx - shld ecx,ecx,5 - vpaddd xmm4,xmm0,xmm7 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - vmovdqa XMMWORD PTR [esp],xmm4 - add eax,DWORD PTR 20[esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD PTR 24[esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add 
edx,DWORD PTR 28[esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD PTR 32[esp] - xor esi,eax - vpshufb xmm2,xmm2,xmm6 - mov ebp,edx - shld edx,edx,5 - vpaddd xmm5,xmm1,xmm7 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - vmovdqa XMMWORD PTR 16[esp],xmm5 - add ebx,DWORD PTR 36[esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD PTR 40[esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD PTR 44[esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD PTR 48[esp] - xor esi,ebx - vpshufb xmm3,xmm3,xmm6 - mov ebp,edi - shld edi,edi,5 - vpaddd xmm6,xmm2,xmm7 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - vmovdqa XMMWORD PTR 32[esp],xmm6 - add ecx,DWORD PTR 52[esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD PTR 56[esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD PTR 60[esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - mov ebp,DWORD PTR 192[esp] - add eax,DWORD PTR [ebp] - add esi,DWORD PTR 4[ebp] - add ecx,DWORD PTR 8[ebp] - mov DWORD PTR [ebp],eax - add edx,DWORD PTR 12[ebp] - mov DWORD PTR 4[ebp],esi - add edi,DWORD PTR 16[ebp] - mov ebx,ecx - mov DWORD PTR 8[ebp],ecx - xor ebx,edx - mov DWORD PTR 12[ebp],edx - mov DWORD PTR 16[ebp],edi - mov ebp,esi - and esi,ebx - mov ebx,ebp - jmp $L009loop -ALIGN 16 -$L010done: - add ebx,DWORD PTR 16[esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD PTR 20[esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - xor esi,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD PTR 24[esp] - xor esi,ecx - mov ebp,eax - shld eax,eax,5 - add edi,esi - xor ebp,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD PTR 28[esp] - xor ebp,ebx - mov esi,edi - shld edi,edi,5 - add edx,ebp - xor esi,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD PTR 32[esp] - xor esi,eax - mov ebp,edx - shld edx,edx,5 - add ecx,esi - xor ebp,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD PTR 36[esp] - xor ebp,edi - mov esi,ecx - shld ecx,ecx,5 - add ebx,ebp - xor esi,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD PTR 40[esp] - xor esi,edx - mov ebp,ebx - shld ebx,ebx,5 - add eax,esi - xor ebp,edx - shrd ecx,ecx,7 - add eax,ebx - add edi,DWORD PTR 44[esp] - xor ebp,ecx - mov esi,eax - shld eax,eax,5 - add edi,ebp - xor esi,ecx - shrd ebx,ebx,7 - add edi,eax - add edx,DWORD PTR 48[esp] - xor esi,ebx - mov ebp,edi - shld edi,edi,5 - add edx,esi - xor ebp,ebx - shrd eax,eax,7 - add edx,edi - add ecx,DWORD PTR 52[esp] - xor ebp,eax - mov esi,edx - shld edx,edx,5 - add ecx,ebp - xor esi,eax - shrd edi,edi,7 - add ecx,edx - add ebx,DWORD PTR 56[esp] - xor esi,edi - mov ebp,ecx - shld ecx,ecx,5 - add ebx,esi - xor ebp,edi - shrd edx,edx,7 - add ebx,ecx - add eax,DWORD PTR 60[esp] - xor ebp,edx - mov esi,ebx - shld ebx,ebx,5 - add eax,ebp - shrd ecx,ecx,7 - add eax,ebx - vzeroall - mov ebp,DWORD PTR 192[esp] - add eax,DWORD PTR [ebp] - mov esp,DWORD PTR 204[esp] - add esi,DWORD PTR 4[ebp] - add ecx,DWORD PTR 8[ebp] - mov DWORD PTR [ebp],eax - add edx,DWORD PTR 12[ebp] - mov DWORD PTR 4[ebp],esi - add 
edi,DWORD PTR 16[ebp] - mov DWORD PTR 8[ebp],ecx - mov DWORD PTR 12[ebp],edx - mov DWORD PTR 16[ebp],edi - pop edi - pop esi - pop ebx - pop ebp - ret -__sha1_block_data_order_avx ENDP ALIGN 64 $LK_XX_XX:: DD 1518500249,1518500249,1518500249,1518500249 diff --git a/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm b/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm index d184877bb717a2..b6af4ab0640332 100644 --- a/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm +++ b/deps/openssl/asm/x86-win32-masm/sha/sha256-586.asm @@ -56,13 +56,12 @@ $L000pic_point: or ecx,ebx and ecx,1342177280 cmp ecx,1342177280 - je $L005AVX test ebx,512 - jnz $L006SSSE3 + jnz $L005SSSE3 $L003no_xmm: sub eax,edi cmp eax,256 - jae $L007unrolled + jae $L006unrolled jmp $L002loop ALIGN 16 $L002loop: @@ -134,7 +133,7 @@ $L002loop: mov DWORD PTR 28[esp],ecx mov DWORD PTR 32[esp],edi ALIGN 16 -$L00800_15: +$L00700_15: mov ecx,edx mov esi,DWORD PTR 24[esp] ror ecx,14 @@ -172,11 +171,11 @@ $L00800_15: add ebp,4 add eax,ebx cmp esi,3248222580 - jne $L00800_15 + jne $L00700_15 mov ecx,DWORD PTR 156[esp] - jmp $L00916_63 + jmp $L00816_63 ALIGN 16 -$L00916_63: +$L00816_63: mov ebx,ecx mov esi,DWORD PTR 104[esp] ror ecx,11 @@ -231,7 +230,7 @@ $L00916_63: add ebp,4 add eax,ebx cmp esi,3329325298 - jne $L00916_63 + jne $L00816_63 mov esi,DWORD PTR 356[esp] mov ebx,DWORD PTR 8[esp] mov ecx,DWORD PTR 16[esp] @@ -290,7 +289,7 @@ DB 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 DB 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 DB 62,0 ALIGN 16 -$L007unrolled: +$L006unrolled: lea esp,DWORD PTR [esp-96] mov eax,DWORD PTR [esi] mov ebp,DWORD PTR 4[esi] @@ -307,9 +306,9 @@ $L007unrolled: mov DWORD PTR 20[esp],ebx mov DWORD PTR 24[esp],ecx mov DWORD PTR 28[esp],esi - jmp $L010grand_loop + jmp $L009grand_loop ALIGN 16 -$L010grand_loop: +$L009grand_loop: mov ebx,DWORD PTR [edi] mov ecx,DWORD PTR 4[edi] bswap ebx @@ -3189,7 +3188,7 @@ $L010grand_loop: mov DWORD PTR 24[esp],ebx mov DWORD PTR 28[esp],ecx cmp edi,DWORD PTR 104[esp] - jb $L010grand_loop + jb $L009grand_loop mov esp,DWORD PTR 108[esp] pop edi pop esi @@ -3208,9 +3207,9 @@ $L004shaext: pshufd xmm2,xmm2,27 DB 102,15,58,15,202,8 punpcklqdq xmm2,xmm0 - jmp $L011loop_shaext + jmp $L010loop_shaext ALIGN 16 -$L011loop_shaext: +$L010loop_shaext: movdqu xmm3,XMMWORD PTR [edi] movdqu xmm4,XMMWORD PTR 16[edi] movdqu xmm5,XMMWORD PTR 32[edi] @@ -3380,7 +3379,7 @@ DB 15,56,203,209 DB 15,56,203,202 paddd xmm2,XMMWORD PTR 16[esp] paddd xmm1,XMMWORD PTR [esp] - jnz $L011loop_shaext + jnz $L010loop_shaext pshufd xmm2,xmm2,177 pshufd xmm7,xmm1,27 pshufd xmm1,xmm1,177 @@ -3395,7 +3394,7 @@ DB 102,15,58,15,215,8 pop ebp ret ALIGN 32 -$L006SSSE3: +$L005SSSE3: lea esp,DWORD PTR [esp-96] mov eax,DWORD PTR [esi] mov ebx,DWORD PTR 4[esi] @@ -3414,9 +3413,9 @@ $L006SSSE3: mov DWORD PTR 24[esp],ecx mov DWORD PTR 28[esp],esi movdqa xmm7,XMMWORD PTR 256[ebp] - jmp $L012grand_ssse3 + jmp $L011grand_ssse3 ALIGN 16 -$L012grand_ssse3: +$L011grand_ssse3: movdqu xmm0,XMMWORD PTR [edi] movdqu xmm1,XMMWORD PTR 16[edi] movdqu xmm2,XMMWORD PTR 32[edi] @@ -3439,9 +3438,9 @@ DB 102,15,56,0,223 paddd xmm7,xmm3 movdqa XMMWORD PTR 64[esp],xmm6 movdqa XMMWORD PTR 80[esp],xmm7 - jmp $L013ssse3_00_47 + jmp $L012ssse3_00_47 ALIGN 16 -$L013ssse3_00_47: +$L012ssse3_00_47: add ebp,64 mov ecx,edx movdqa xmm4,xmm1 @@ -4084,7 +4083,7 @@ DB 102,15,58,15,249,4 add eax,ecx movdqa XMMWORD PTR 80[esp],xmm6 cmp DWORD PTR 64[ebp],66051 - jne $L013ssse3_00_47 + jne $L012ssse3_00_47 mov ecx,edx ror edx,14 mov esi,DWORD 
PTR 20[esp] @@ -4598,2218 +4597,13 @@ DB 102,15,58,15,249,4 movdqa xmm7,XMMWORD PTR 64[ebp] sub ebp,192 cmp edi,DWORD PTR 104[esp] - jb $L012grand_ssse3 + jb $L011grand_ssse3 mov esp,DWORD PTR 108[esp] pop edi pop esi pop ebx pop ebp ret -ALIGN 32 -$L005AVX: - and edx,264 - cmp edx,264 - je $L014AVX_BMI - lea esp,DWORD PTR [esp-96] - vzeroall - mov eax,DWORD PTR [esi] - mov ebx,DWORD PTR 4[esi] - mov ecx,DWORD PTR 8[esi] - mov edi,DWORD PTR 12[esi] - mov DWORD PTR 4[esp],ebx - xor ebx,ecx - mov DWORD PTR 8[esp],ecx - mov DWORD PTR 12[esp],edi - mov edx,DWORD PTR 16[esi] - mov edi,DWORD PTR 20[esi] - mov ecx,DWORD PTR 24[esi] - mov esi,DWORD PTR 28[esi] - mov DWORD PTR 20[esp],edi - mov edi,DWORD PTR 100[esp] - mov DWORD PTR 24[esp],ecx - mov DWORD PTR 28[esp],esi - vmovdqa xmm7,XMMWORD PTR 256[ebp] - jmp $L015grand_avx -ALIGN 32 -$L015grand_avx: - vmovdqu xmm0,XMMWORD PTR [edi] - vmovdqu xmm1,XMMWORD PTR 16[edi] - vmovdqu xmm2,XMMWORD PTR 32[edi] - vmovdqu xmm3,XMMWORD PTR 48[edi] - add edi,64 - vpshufb xmm0,xmm0,xmm7 - mov DWORD PTR 100[esp],edi - vpshufb xmm1,xmm1,xmm7 - vpshufb xmm2,xmm2,xmm7 - vpaddd xmm4,xmm0,XMMWORD PTR [ebp] - vpshufb xmm3,xmm3,xmm7 - vpaddd xmm5,xmm1,XMMWORD PTR 16[ebp] - vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp] - vpaddd xmm7,xmm3,XMMWORD PTR 48[ebp] - vmovdqa XMMWORD PTR 32[esp],xmm4 - vmovdqa XMMWORD PTR 48[esp],xmm5 - vmovdqa XMMWORD PTR 64[esp],xmm6 - vmovdqa XMMWORD PTR 80[esp],xmm7 - jmp $L016avx_00_47 -ALIGN 16 -$L016avx_00_47: - add ebp,64 - vpalignr xmm4,xmm1,xmm0,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 20[esp] - vpalignr xmm7,xmm3,xmm2,4 - xor edx,ecx - mov edi,DWORD PTR 24[esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 16[esp],ecx - vpaddd xmm0,xmm0,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 4[esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR [esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 28[esp] - vpshufd xmm7,xmm3,250 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD PTR 32[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD PTR 12[esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 16[esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD PTR 20[esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 12[esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR [esp] - vpaddd xmm0,xmm0,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 28[esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 24[esp] - vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD PTR 36[esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD PTR 8[esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 12[esp] - vpaddd xmm0,xmm0,xmm7 - xor edx,ecx - mov edi,DWORD PTR 16[esp] - xor esi,edi - vpshufd xmm7,xmm0,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 8[esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 28[esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 24[esp],eax - vpsrlq xmm7,xmm7,19 - xor 
ecx,eax - xor eax,edi - add edx,DWORD PTR 20[esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD PTR 40[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD PTR 4[esp] - add ebx,ecx - vpaddd xmm0,xmm0,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 8[esp] - vpaddd xmm6,xmm0,XMMWORD PTR [ebp] - xor edx,ecx - mov edi,DWORD PTR 12[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 4[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 24[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 20[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 16[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 44[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR [esp] - add eax,ecx - vmovdqa XMMWORD PTR 32[esp],xmm6 - vpalignr xmm4,xmm2,xmm1,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 4[esp] - vpalignr xmm7,xmm0,xmm3,4 - xor edx,ecx - mov edi,DWORD PTR 8[esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR [esp],ecx - vpaddd xmm1,xmm1,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 20[esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 16[esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 12[esp] - vpshufd xmm7,xmm0,250 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD PTR 48[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD PTR 28[esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR [esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD PTR 4[esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 28[esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 16[esp] - vpaddd xmm1,xmm1,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 12[esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 8[esp] - vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD PTR 52[esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD PTR 24[esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 28[esp] - vpaddd xmm1,xmm1,xmm7 - xor edx,ecx - mov edi,DWORD PTR [esp] - xor esi,edi - vpshufd xmm7,xmm1,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 24[esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 12[esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 8[esp],eax - vpsrlq xmm7,xmm7,19 - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 4[esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD PTR 56[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD PTR 20[esp] - add ebx,ecx - vpaddd xmm1,xmm1,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 24[esp] - vpaddd xmm6,xmm1,XMMWORD PTR 16[ebp] - xor edx,ecx - mov edi,DWORD PTR 28[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 20[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx 
- add edx,edi - mov edi,DWORD PTR 8[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 4[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 60[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 16[esp] - add eax,ecx - vmovdqa XMMWORD PTR 48[esp],xmm6 - vpalignr xmm4,xmm3,xmm2,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 20[esp] - vpalignr xmm7,xmm1,xmm0,4 - xor edx,ecx - mov edi,DWORD PTR 24[esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 16[esp],ecx - vpaddd xmm2,xmm2,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 4[esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR [esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 28[esp] - vpshufd xmm7,xmm1,250 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD PTR 64[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD PTR 12[esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 16[esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD PTR 20[esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 12[esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR [esp] - vpaddd xmm2,xmm2,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 28[esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 24[esp] - vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD PTR 68[esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD PTR 8[esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 12[esp] - vpaddd xmm2,xmm2,xmm7 - xor edx,ecx - mov edi,DWORD PTR 16[esp] - xor esi,edi - vpshufd xmm7,xmm2,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 8[esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 28[esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 24[esp],eax - vpsrlq xmm7,xmm7,19 - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 20[esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD PTR 72[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD PTR 4[esp] - add ebx,ecx - vpaddd xmm2,xmm2,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 8[esp] - vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp] - xor edx,ecx - mov edi,DWORD PTR 12[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 4[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 24[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 20[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 16[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 76[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR [esp] - add eax,ecx - vmovdqa XMMWORD PTR 64[esp],xmm6 - vpalignr xmm4,xmm0,xmm3,4 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 4[esp] - vpalignr xmm7,xmm2,xmm1,4 - xor edx,ecx - mov edi,DWORD PTR 8[esp] - xor esi,edi - vpsrld xmm6,xmm4,7 - shrd edx,edx,5 - and 
esi,ecx - mov DWORD PTR [esp],ecx - vpaddd xmm3,xmm3,xmm7 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrld xmm7,xmm4,3 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 20[esp] - vpslld xmm5,xmm4,14 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 16[esp],eax - vpxor xmm4,xmm7,xmm6 - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 12[esp] - vpshufd xmm7,xmm2,250 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpsrld xmm6,xmm6,11 - add edx,DWORD PTR 80[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpxor xmm4,xmm4,xmm5 - add ebx,edx - add edx,DWORD PTR 28[esp] - add ebx,ecx - vpslld xmm5,xmm5,11 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR [esp] - vpxor xmm4,xmm4,xmm6 - xor edx,ecx - mov edi,DWORD PTR 4[esp] - xor esi,edi - vpsrld xmm6,xmm7,10 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 28[esp],ecx - vpxor xmm4,xmm4,xmm5 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 16[esp] - vpaddd xmm3,xmm3,xmm4 - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 12[esp],ebx - vpxor xmm6,xmm6,xmm5 - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 8[esp] - vpsrlq xmm7,xmm7,19 - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - add edx,DWORD PTR 84[esp] - xor eax,edi - shrd ecx,ecx,2 - vpshufd xmm7,xmm6,132 - add eax,edx - add edx,DWORD PTR 24[esp] - add eax,ecx - vpsrldq xmm7,xmm7,8 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 28[esp] - vpaddd xmm3,xmm3,xmm7 - xor edx,ecx - mov edi,DWORD PTR [esp] - xor esi,edi - vpshufd xmm7,xmm3,80 - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 24[esp],ecx - vpsrld xmm6,xmm7,10 - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - vpsrlq xmm5,xmm7,17 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 12[esp] - vpxor xmm6,xmm6,xmm5 - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 8[esp],eax - vpsrlq xmm7,xmm7,19 - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 4[esp] - vpxor xmm6,xmm6,xmm7 - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - vpshufd xmm7,xmm6,232 - add edx,DWORD PTR 88[esp] - xor ebx,edi - shrd ecx,ecx,2 - vpslldq xmm7,xmm7,8 - add ebx,edx - add edx,DWORD PTR 20[esp] - add ebx,ecx - vpaddd xmm3,xmm3,xmm7 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 24[esp] - vpaddd xmm6,xmm3,XMMWORD PTR 48[ebp] - xor edx,ecx - mov edi,DWORD PTR 28[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 20[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 8[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 4[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 92[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 16[esp] - add eax,ecx - vmovdqa XMMWORD PTR 80[esp],xmm6 - cmp DWORD PTR 64[ebp],66051 - jne $L016avx_00_47 - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 20[esp] - xor edx,ecx - mov edi,DWORD PTR 24[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 16[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 4[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR [esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 28[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 32[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 12[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 16[esp] - xor edx,ecx - mov edi,DWORD PTR 20[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 12[esp],ecx - xor 
edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR [esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 28[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 24[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 36[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 8[esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 12[esp] - xor edx,ecx - mov edi,DWORD PTR 16[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 8[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 28[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 24[esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 20[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 40[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 4[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 8[esp] - xor edx,ecx - mov edi,DWORD PTR 12[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 4[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 24[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 20[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 16[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 44[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR [esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 4[esp] - xor edx,ecx - mov edi,DWORD PTR 8[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR [esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 20[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 16[esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 12[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 48[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 28[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR [esp] - xor edx,ecx - mov edi,DWORD PTR 4[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 28[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 16[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 12[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 8[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 52[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 24[esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 28[esp] - xor edx,ecx - mov edi,DWORD PTR [esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 24[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 12[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 8[esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 4[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 56[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 20[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 24[esp] - xor edx,ecx - mov edi,DWORD PTR 28[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 20[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 8[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 4[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add 
edx,DWORD PTR 60[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 16[esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 20[esp] - xor edx,ecx - mov edi,DWORD PTR 24[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 16[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 4[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR [esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 28[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 64[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 12[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 16[esp] - xor edx,ecx - mov edi,DWORD PTR 20[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 12[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR [esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 28[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 24[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 68[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 8[esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 12[esp] - xor edx,ecx - mov edi,DWORD PTR 16[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 8[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 28[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 24[esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 20[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 72[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 4[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 8[esp] - xor edx,ecx - mov edi,DWORD PTR 12[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 4[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 24[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 20[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 16[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 76[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR [esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 4[esp] - xor edx,ecx - mov edi,DWORD PTR 8[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR [esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 20[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 16[esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 12[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 80[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 28[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR [esp] - xor edx,ecx - mov edi,DWORD PTR 4[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 28[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 16[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 12[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR 8[esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 84[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 24[esp] - add eax,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 28[esp] - xor edx,ecx - mov edi,DWORD PTR [esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov 
DWORD PTR 24[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,eax - add edx,edi - mov edi,DWORD PTR 12[esp] - mov esi,eax - shrd ecx,ecx,9 - mov DWORD PTR 8[esp],eax - xor ecx,eax - xor eax,edi - add edx,DWORD PTR 4[esp] - shrd ecx,ecx,11 - and ebx,eax - xor ecx,esi - add edx,DWORD PTR 88[esp] - xor ebx,edi - shrd ecx,ecx,2 - add ebx,edx - add edx,DWORD PTR 20[esp] - add ebx,ecx - mov ecx,edx - shrd edx,edx,14 - mov esi,DWORD PTR 24[esp] - xor edx,ecx - mov edi,DWORD PTR 28[esp] - xor esi,edi - shrd edx,edx,5 - and esi,ecx - mov DWORD PTR 20[esp],ecx - xor edx,ecx - xor edi,esi - shrd edx,edx,6 - mov ecx,ebx - add edx,edi - mov edi,DWORD PTR 8[esp] - mov esi,ebx - shrd ecx,ecx,9 - mov DWORD PTR 4[esp],ebx - xor ecx,ebx - xor ebx,edi - add edx,DWORD PTR [esp] - shrd ecx,ecx,11 - and eax,ebx - xor ecx,esi - add edx,DWORD PTR 92[esp] - xor eax,edi - shrd ecx,ecx,2 - add eax,edx - add edx,DWORD PTR 16[esp] - add eax,ecx - mov esi,DWORD PTR 96[esp] - xor ebx,edi - mov ecx,DWORD PTR 12[esp] - add eax,DWORD PTR [esi] - add ebx,DWORD PTR 4[esi] - add edi,DWORD PTR 8[esi] - add ecx,DWORD PTR 12[esi] - mov DWORD PTR [esi],eax - mov DWORD PTR 4[esi],ebx - mov DWORD PTR 8[esi],edi - mov DWORD PTR 12[esi],ecx - mov DWORD PTR 4[esp],ebx - xor ebx,edi - mov DWORD PTR 8[esp],edi - mov DWORD PTR 12[esp],ecx - mov edi,DWORD PTR 20[esp] - mov ecx,DWORD PTR 24[esp] - add edx,DWORD PTR 16[esi] - add edi,DWORD PTR 20[esi] - add ecx,DWORD PTR 24[esi] - mov DWORD PTR 16[esi],edx - mov DWORD PTR 20[esi],edi - mov DWORD PTR 20[esp],edi - mov edi,DWORD PTR 28[esp] - mov DWORD PTR 24[esi],ecx - add edi,DWORD PTR 28[esi] - mov DWORD PTR 24[esp],ecx - mov DWORD PTR 28[esi],edi - mov DWORD PTR 28[esp],edi - mov edi,DWORD PTR 100[esp] - vmovdqa xmm7,XMMWORD PTR 64[ebp] - sub ebp,192 - cmp edi,DWORD PTR 104[esp] - jb $L015grand_avx - mov esp,DWORD PTR 108[esp] - vzeroall - pop edi - pop esi - pop ebx - pop ebp - ret -ALIGN 32 -$L014AVX_BMI: - lea esp,DWORD PTR [esp-96] - vzeroall - mov eax,DWORD PTR [esi] - mov ebx,DWORD PTR 4[esi] - mov ecx,DWORD PTR 8[esi] - mov edi,DWORD PTR 12[esi] - mov DWORD PTR 4[esp],ebx - xor ebx,ecx - mov DWORD PTR 8[esp],ecx - mov DWORD PTR 12[esp],edi - mov edx,DWORD PTR 16[esi] - mov edi,DWORD PTR 20[esi] - mov ecx,DWORD PTR 24[esi] - mov esi,DWORD PTR 28[esi] - mov DWORD PTR 20[esp],edi - mov edi,DWORD PTR 100[esp] - mov DWORD PTR 24[esp],ecx - mov DWORD PTR 28[esp],esi - vmovdqa xmm7,XMMWORD PTR 256[ebp] - jmp $L017grand_avx_bmi -ALIGN 32 -$L017grand_avx_bmi: - vmovdqu xmm0,XMMWORD PTR [edi] - vmovdqu xmm1,XMMWORD PTR 16[edi] - vmovdqu xmm2,XMMWORD PTR 32[edi] - vmovdqu xmm3,XMMWORD PTR 48[edi] - add edi,64 - vpshufb xmm0,xmm0,xmm7 - mov DWORD PTR 100[esp],edi - vpshufb xmm1,xmm1,xmm7 - vpshufb xmm2,xmm2,xmm7 - vpaddd xmm4,xmm0,XMMWORD PTR [ebp] - vpshufb xmm3,xmm3,xmm7 - vpaddd xmm5,xmm1,XMMWORD PTR 16[ebp] - vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp] - vpaddd xmm7,xmm3,XMMWORD PTR 48[ebp] - vmovdqa XMMWORD PTR 32[esp],xmm4 - vmovdqa XMMWORD PTR 48[esp],xmm5 - vmovdqa XMMWORD PTR 64[esp],xmm6 - vmovdqa XMMWORD PTR 80[esp],xmm7 - jmp $L018avx_bmi_00_47 -ALIGN 16 -$L018avx_bmi_00_47: - add ebp,64 - vpalignr xmm4,xmm1,xmm0,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 16[esp],edx - vpalignr xmm7,xmm3,xmm2,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 24[esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD PTR 20[esp] - mov DWORD PTR [esp],eax - vpaddd xmm0,xmm0,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,DWORD PTR 
[ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD PTR 4[esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD PTR 28[esp] - and ebx,eax - add edx,DWORD PTR 32[esp] - vpshufd xmm7,xmm3,250 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 12[esp] - vpsrld xmm6,xmm6,11 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD PTR 12[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD PTR 20[esp] - xor ecx,edi - and edx,DWORD PTR 16[esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD PTR 28[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpsrld xmm6,xmm7,10 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD PTR [esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD PTR 24[esp] - and eax,ebx - add edx,DWORD PTR 36[esp] - vpaddd xmm0,xmm0,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD PTR 8[esp] - vpxor xmm6,xmm6,xmm5 - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD PTR 8[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD PTR 16[esp] - xor ecx,edi - and edx,DWORD PTR 12[esp] - vpshufd xmm7,xmm6,132 - mov DWORD PTR 24[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrldq xmm7,xmm7,8 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm0,xmm0,xmm7 - mov edi,DWORD PTR 28[esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm0,80 - add edx,DWORD PTR 20[esp] - and ebx,eax - add edx,DWORD PTR 40[esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 4[esp] - vpsrlq xmm5,xmm7,17 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD PTR 4[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD PTR 12[esp] - xor ecx,edi - and edx,DWORD PTR 8[esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD PTR 20[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD PTR 24[esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm0,xmm0,xmm7 - add edx,DWORD PTR 16[esp] - and eax,ebx - add edx,DWORD PTR 44[esp] - vpaddd xmm6,xmm0,XMMWORD PTR [ebp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR [esp] - lea eax,DWORD PTR [ecx*1+eax] - vmovdqa XMMWORD PTR 32[esp],xmm6 - vpalignr xmm4,xmm2,xmm1,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR [esp],edx - vpalignr xmm7,xmm0,xmm3,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 8[esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD PTR 4[esp] - mov DWORD PTR 16[esp],eax - vpaddd xmm1,xmm1,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD PTR 20[esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD PTR 12[esp] - and ebx,eax - add edx,DWORD PTR 48[esp] - vpshufd xmm7,xmm0,250 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 28[esp] - vpsrld xmm6,xmm6,11 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD PTR 28[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD PTR 4[esp] - xor ecx,edi - and edx,DWORD PTR [esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD PTR 12[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpsrld xmm6,xmm7,10 - 
lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD PTR 16[esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD PTR 8[esp] - and eax,ebx - add edx,DWORD PTR 52[esp] - vpaddd xmm1,xmm1,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD PTR 24[esp] - vpxor xmm6,xmm6,xmm5 - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD PTR 24[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD PTR [esp] - xor ecx,edi - and edx,DWORD PTR 28[esp] - vpshufd xmm7,xmm6,132 - mov DWORD PTR 8[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrldq xmm7,xmm7,8 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm1,xmm1,xmm7 - mov edi,DWORD PTR 12[esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm1,80 - add edx,DWORD PTR 4[esp] - and ebx,eax - add edx,DWORD PTR 56[esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 20[esp] - vpsrlq xmm5,xmm7,17 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD PTR 20[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD PTR 28[esp] - xor ecx,edi - and edx,DWORD PTR 24[esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD PTR 4[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD PTR 8[esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm1,xmm1,xmm7 - add edx,DWORD PTR [esp] - and eax,ebx - add edx,DWORD PTR 60[esp] - vpaddd xmm6,xmm1,XMMWORD PTR 16[ebp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR 16[esp] - lea eax,DWORD PTR [ecx*1+eax] - vmovdqa XMMWORD PTR 48[esp],xmm6 - vpalignr xmm4,xmm3,xmm2,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 16[esp],edx - vpalignr xmm7,xmm1,xmm0,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 24[esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD PTR 20[esp] - mov DWORD PTR [esp],eax - vpaddd xmm2,xmm2,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD PTR 4[esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD PTR 28[esp] - and ebx,eax - add edx,DWORD PTR 64[esp] - vpshufd xmm7,xmm1,250 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 12[esp] - vpsrld xmm6,xmm6,11 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD PTR 12[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD PTR 20[esp] - xor ecx,edi - and edx,DWORD PTR 16[esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD PTR 28[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpsrld xmm6,xmm7,10 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD PTR [esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD PTR 24[esp] - and eax,ebx - add edx,DWORD PTR 68[esp] - vpaddd xmm2,xmm2,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD PTR 8[esp] - vpxor xmm6,xmm6,xmm5 - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD PTR 8[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD PTR 16[esp] - xor ecx,edi - and edx,DWORD PTR 12[esp] - vpshufd xmm7,xmm6,132 - mov DWORD PTR 24[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - 
vpsrldq xmm7,xmm7,8 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm2,xmm2,xmm7 - mov edi,DWORD PTR 28[esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm2,80 - add edx,DWORD PTR 20[esp] - and ebx,eax - add edx,DWORD PTR 72[esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 4[esp] - vpsrlq xmm5,xmm7,17 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD PTR 4[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD PTR 12[esp] - xor ecx,edi - and edx,DWORD PTR 8[esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD PTR 20[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD PTR 24[esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm2,xmm2,xmm7 - add edx,DWORD PTR 16[esp] - and eax,ebx - add edx,DWORD PTR 76[esp] - vpaddd xmm6,xmm2,XMMWORD PTR 32[ebp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR [esp] - lea eax,DWORD PTR [ecx*1+eax] - vmovdqa XMMWORD PTR 64[esp],xmm6 - vpalignr xmm4,xmm0,xmm3,4 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR [esp],edx - vpalignr xmm7,xmm2,xmm1,4 - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 8[esp] - vpsrld xmm6,xmm4,7 - xor ecx,edi - and edx,DWORD PTR 4[esp] - mov DWORD PTR 16[esp],eax - vpaddd xmm3,xmm3,xmm7 - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrld xmm7,xmm4,3 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpslld xmm5,xmm4,14 - mov edi,DWORD PTR 20[esp] - xor ecx,esi - xor eax,edi - vpxor xmm4,xmm7,xmm6 - add edx,DWORD PTR 12[esp] - and ebx,eax - add edx,DWORD PTR 80[esp] - vpshufd xmm7,xmm2,250 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 28[esp] - vpsrld xmm6,xmm6,11 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm4,xmm4,xmm5 - mov DWORD PTR 28[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpslld xmm5,xmm5,11 - andn esi,edx,DWORD PTR 4[esp] - xor ecx,edi - and edx,DWORD PTR [esp] - vpxor xmm4,xmm4,xmm6 - mov DWORD PTR 12[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - vpsrld xmm6,xmm7,10 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpxor xmm4,xmm4,xmm5 - mov edi,DWORD PTR 16[esp] - xor ecx,esi - xor ebx,edi - vpsrlq xmm5,xmm7,17 - add edx,DWORD PTR 8[esp] - and eax,ebx - add edx,DWORD PTR 84[esp] - vpaddd xmm3,xmm3,xmm4 - xor eax,edi - add ecx,edx - add edx,DWORD PTR 24[esp] - vpxor xmm6,xmm6,xmm5 - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - vpsrlq xmm7,xmm7,19 - mov DWORD PTR 24[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpxor xmm6,xmm6,xmm7 - andn esi,edx,DWORD PTR [esp] - xor ecx,edi - and edx,DWORD PTR 28[esp] - vpshufd xmm7,xmm6,132 - mov DWORD PTR 8[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - vpsrldq xmm7,xmm7,8 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - vpaddd xmm3,xmm3,xmm7 - mov edi,DWORD PTR 12[esp] - xor ecx,esi - xor eax,edi - vpshufd xmm7,xmm3,80 - add edx,DWORD PTR 4[esp] - and ebx,eax - add edx,DWORD PTR 88[esp] - vpsrld xmm6,xmm7,10 - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 20[esp] - vpsrlq xmm5,xmm7,17 - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - vpxor xmm6,xmm6,xmm5 - mov DWORD PTR 20[esp],edx - rorx edi,edx,25 - xor ecx,esi - vpsrlq xmm7,xmm7,19 - andn esi,edx,DWORD PTR 28[esp] - xor ecx,edi - and edx,DWORD PTR 24[esp] - vpxor xmm6,xmm6,xmm7 - mov DWORD PTR 4[esp],ebx - or edx,esi - rorx edi,ebx,2 - 
rorx esi,ebx,13 - vpshufd xmm7,xmm6,232 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - vpslldq xmm7,xmm7,8 - mov edi,DWORD PTR 8[esp] - xor ecx,esi - xor ebx,edi - vpaddd xmm3,xmm3,xmm7 - add edx,DWORD PTR [esp] - and eax,ebx - add edx,DWORD PTR 92[esp] - vpaddd xmm6,xmm3,XMMWORD PTR 48[ebp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR 16[esp] - lea eax,DWORD PTR [ecx*1+eax] - vmovdqa XMMWORD PTR 80[esp],xmm6 - cmp DWORD PTR 64[ebp],66051 - jne $L018avx_bmi_00_47 - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 16[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 24[esp] - xor ecx,edi - and edx,DWORD PTR 20[esp] - mov DWORD PTR [esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 4[esp] - xor ecx,esi - xor eax,edi - add edx,DWORD PTR 28[esp] - and ebx,eax - add edx,DWORD PTR 32[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 12[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 12[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 20[esp] - xor ecx,edi - and edx,DWORD PTR 16[esp] - mov DWORD PTR 28[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR [esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR 24[esp] - and eax,ebx - add edx,DWORD PTR 36[esp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR 8[esp] - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 8[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 16[esp] - xor ecx,edi - and edx,DWORD PTR 12[esp] - mov DWORD PTR 24[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 28[esp] - xor ecx,esi - xor eax,edi - add edx,DWORD PTR 20[esp] - and ebx,eax - add edx,DWORD PTR 40[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 4[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 4[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 12[esp] - xor ecx,edi - and edx,DWORD PTR 8[esp] - mov DWORD PTR 20[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR 24[esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR 16[esp] - and eax,ebx - add edx,DWORD PTR 44[esp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR [esp] - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR [esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 8[esp] - xor ecx,edi - and edx,DWORD PTR 4[esp] - mov DWORD PTR 16[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 20[esp] - xor ecx,esi - xor eax,edi - add edx,DWORD PTR 12[esp] - and ebx,eax - add edx,DWORD PTR 48[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 28[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 28[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 4[esp] - xor ecx,edi - and edx,DWORD PTR [esp] - mov DWORD PTR 12[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR 16[esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR 8[esp] - and eax,ebx - add edx,DWORD PTR 52[esp] - xor eax,edi - add ecx,edx 
- add edx,DWORD PTR 24[esp] - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 24[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR [esp] - xor ecx,edi - and edx,DWORD PTR 28[esp] - mov DWORD PTR 8[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 12[esp] - xor ecx,esi - xor eax,edi - add edx,DWORD PTR 4[esp] - and ebx,eax - add edx,DWORD PTR 56[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 20[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 20[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 28[esp] - xor ecx,edi - and edx,DWORD PTR 24[esp] - mov DWORD PTR 4[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR 8[esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR [esp] - and eax,ebx - add edx,DWORD PTR 60[esp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR 16[esp] - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 16[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 24[esp] - xor ecx,edi - and edx,DWORD PTR 20[esp] - mov DWORD PTR [esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 4[esp] - xor ecx,esi - xor eax,edi - add edx,DWORD PTR 28[esp] - and ebx,eax - add edx,DWORD PTR 64[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 12[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 12[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 20[esp] - xor ecx,edi - and edx,DWORD PTR 16[esp] - mov DWORD PTR 28[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR [esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR 24[esp] - and eax,ebx - add edx,DWORD PTR 68[esp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR 8[esp] - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 8[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 16[esp] - xor ecx,edi - and edx,DWORD PTR 12[esp] - mov DWORD PTR 24[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 28[esp] - xor ecx,esi - xor eax,edi - add edx,DWORD PTR 20[esp] - and ebx,eax - add edx,DWORD PTR 72[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 4[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 4[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 12[esp] - xor ecx,edi - and edx,DWORD PTR 8[esp] - mov DWORD PTR 20[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR 24[esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR 16[esp] - and eax,ebx - add edx,DWORD PTR 76[esp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR [esp] - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR [esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 8[esp] - xor ecx,edi - and edx,DWORD PTR 4[esp] - mov DWORD PTR 16[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 20[esp] - xor ecx,esi - xor eax,edi - add 
edx,DWORD PTR 12[esp] - and ebx,eax - add edx,DWORD PTR 80[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 28[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 28[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 4[esp] - xor ecx,edi - and edx,DWORD PTR [esp] - mov DWORD PTR 12[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR 16[esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR 8[esp] - and eax,ebx - add edx,DWORD PTR 84[esp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR 24[esp] - lea eax,DWORD PTR [ecx*1+eax] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 24[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR [esp] - xor ecx,edi - and edx,DWORD PTR 28[esp] - mov DWORD PTR 8[esp],eax - or edx,esi - rorx edi,eax,2 - rorx esi,eax,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,eax,22 - xor esi,edi - mov edi,DWORD PTR 12[esp] - xor ecx,esi - xor eax,edi - add edx,DWORD PTR 4[esp] - and ebx,eax - add edx,DWORD PTR 88[esp] - xor ebx,edi - add ecx,edx - add edx,DWORD PTR 20[esp] - lea ebx,DWORD PTR [ecx*1+ebx] - rorx ecx,edx,6 - rorx esi,edx,11 - mov DWORD PTR 20[esp],edx - rorx edi,edx,25 - xor ecx,esi - andn esi,edx,DWORD PTR 28[esp] - xor ecx,edi - and edx,DWORD PTR 24[esp] - mov DWORD PTR 4[esp],ebx - or edx,esi - rorx edi,ebx,2 - rorx esi,ebx,13 - lea edx,DWORD PTR [ecx*1+edx] - rorx ecx,ebx,22 - xor esi,edi - mov edi,DWORD PTR 8[esp] - xor ecx,esi - xor ebx,edi - add edx,DWORD PTR [esp] - and eax,ebx - add edx,DWORD PTR 92[esp] - xor eax,edi - add ecx,edx - add edx,DWORD PTR 16[esp] - lea eax,DWORD PTR [ecx*1+eax] - mov esi,DWORD PTR 96[esp] - xor ebx,edi - mov ecx,DWORD PTR 12[esp] - add eax,DWORD PTR [esi] - add ebx,DWORD PTR 4[esi] - add edi,DWORD PTR 8[esi] - add ecx,DWORD PTR 12[esi] - mov DWORD PTR [esi],eax - mov DWORD PTR 4[esi],ebx - mov DWORD PTR 8[esi],edi - mov DWORD PTR 12[esi],ecx - mov DWORD PTR 4[esp],ebx - xor ebx,edi - mov DWORD PTR 8[esp],edi - mov DWORD PTR 12[esp],ecx - mov edi,DWORD PTR 20[esp] - mov ecx,DWORD PTR 24[esp] - add edx,DWORD PTR 16[esi] - add edi,DWORD PTR 20[esi] - add ecx,DWORD PTR 24[esi] - mov DWORD PTR 16[esi],edx - mov DWORD PTR 20[esi],edi - mov DWORD PTR 20[esp],edi - mov edi,DWORD PTR 28[esp] - mov DWORD PTR 24[esi],ecx - add edi,DWORD PTR 28[esi] - mov DWORD PTR 24[esp],ecx - mov DWORD PTR 28[esi],edi - mov DWORD PTR 28[esp],edi - mov edi,DWORD PTR 100[esp] - vmovdqa xmm7,XMMWORD PTR 64[ebp] - sub ebp,192 - cmp edi,DWORD PTR 104[esp] - jb $L017grand_avx_bmi - mov esp,DWORD PTR 108[esp] - vzeroall - pop edi - pop esi - pop ebx - pop ebp - ret _sha256_block_data_order ENDP .text$ ENDS .bss SEGMENT 'BSS' diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s index 0bdfe91fc530bc..c21cce10f5db9e 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aes-x86_64.s @@ -81,8 +81,8 @@ _x86_64_AES_encrypt: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $65280,%edi - andl $65280,%ebp + andl $0x0000ff00,%edi + andl $0x0000ff00,%ebp xorl %edi,%r10d xorl %ebp,%r11d @@ -94,8 +94,8 @@ _x86_64_AES_encrypt: movl 0(%r14,%rsi,8),%esi movl 0(%r14,%rdi,8),%edi - andl $65280,%esi - andl $65280,%edi + andl $0x0000ff00,%esi + andl $0x0000ff00,%edi shrl $16,%ebx xorl %esi,%r12d xorl %edi,%r8d @@ -108,9 +108,9 @@ _x86_64_AES_encrypt: movl 0(%r14,%rdi,8),%edi movl 
0(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $16711680,%edi - andl $16711680,%ebp + andl $0x00ff0000,%esi + andl $0x00ff0000,%edi + andl $0x00ff0000,%ebp xorl %esi,%r10d xorl %edi,%r11d @@ -123,9 +123,9 @@ _x86_64_AES_encrypt: movl 2(%r14,%rdi,8),%edi movl 2(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $4278190080,%edi - andl $4278190080,%ebp + andl $0x00ff0000,%esi + andl $0xff000000,%edi + andl $0xff000000,%ebp xorl %esi,%r8d xorl %edi,%r10d @@ -138,8 +138,8 @@ _x86_64_AES_encrypt: movl 2(%r14,%rdi,8),%edi movl 16+0(%r15),%eax - andl $4278190080,%esi - andl $4278190080,%edi + andl $0xff000000,%esi + andl $0xff000000,%edi xorl %esi,%r12d xorl %edi,%r8d @@ -241,8 +241,8 @@ _x86_64_AES_encrypt_compact: xorl %r8d,%edx cmpq 16(%rsp),%r15 je .Lenc_compact_done - movl $2155905152,%r10d - movl $2155905152,%r11d + movl $0x80808080,%r10d + movl $0x80808080,%r11d andl %eax,%r10d andl %ebx,%r11d movl %r10d,%esi @@ -253,10 +253,10 @@ _x86_64_AES_encrypt_compact: leal (%rbx,%rbx,1),%r9d subl %r10d,%esi subl %r11d,%edi - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %eax,%r10d movl %ebx,%r11d xorl %esi,%r8d @@ -264,9 +264,9 @@ _x86_64_AES_encrypt_compact: xorl %r8d,%eax xorl %r9d,%ebx - movl $2155905152,%r12d + movl $0x80808080,%r12d roll $24,%eax - movl $2155905152,%ebp + movl $0x80808080,%ebp roll $24,%ebx andl %ecx,%r12d andl %edx,%ebp @@ -289,10 +289,10 @@ _x86_64_AES_encrypt_compact: xorl %r10d,%eax xorl %r11d,%ebx - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %ecx,%r12d movl %edx,%ebp xorl %esi,%r8d @@ -345,7 +345,7 @@ AES_encrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -370,7 +370,7 @@ AES_encrypt: leaq .LAES_Te+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 call _x86_64_AES_encrypt_compact @@ -792,7 +792,7 @@ AES_decrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -817,7 +817,7 @@ AES_decrypt: leaq .LAES_Td+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 shrq $3,%rbp addq %rbp,%r14 @@ -1333,9 +1333,9 @@ AES_cbc_encrypt: movq %r14,%r10 leaq 2304(%r14),%r11 movq %r15,%r12 - andq $4095,%r10 - andq $4095,%r11 - andq $4095,%r12 + andq $0xFFF,%r10 + andq $0xFFF,%r11 + andq $0xFFF,%r12 cmpq %r11,%r12 jb .Lcbc_te_break_out @@ -1344,7 +1344,7 @@ AES_cbc_encrypt: jmp .Lcbc_te_ok .Lcbc_te_break_out: subq %r10,%r12 - andq $4095,%r12 + andq $0xFFF,%r12 addq $320,%r12 subq %r12,%r15 .align 4 @@ -1370,7 +1370,7 @@ AES_cbc_encrypt: movq %r15,%r10 subq %r14,%r10 - andq $4095,%r10 + andq $0xfff,%r10 cmpq $2304,%r10 jb .Lcbc_do_ecopy cmpq $4096-248,%r10 @@ -1557,7 +1557,7 @@ AES_cbc_encrypt: leaq -88-63(%rcx),%r10 subq %rbp,%r10 negq %r10 - andq $960,%r10 + andq $0x3c0,%r10 subq %r10,%rbp xchgq %rsp,%rbp @@ -1586,7 +1586,7 @@ AES_cbc_encrypt: leaq 2048(%r14),%r14 leaq 768-8(%rsp),%rax subq %r14,%rax - andq $768,%rax + andq $0x300,%rax leaq (%r14,%rax,1),%r14 cmpq $0,%rbx diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s index d4ed2047c6db9c..edbd5cb343c327 100644 --- 
a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-sha1-x86_64.s @@ -1392,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext: movups 16(%rcx),%xmm0 leaq 112(%rcx),%rcx - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 jmp .Loop_shaext .align 16 @@ -1672,8 +1672,8 @@ aesni_cbc_sha1_enc_shaext: leaq 64(%rdi),%rdi jnz .Loop_shaext - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 movups %xmm2,(%r8) movdqu %xmm8,(%r9) movd %xmm9,16(%r9) diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s index 6573fe4be3494d..fcf42adbb4b83a 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/aesni-x86_64.s @@ -503,7 +503,7 @@ aesni_ecb_encrypt: testl %r8d,%r8d jz .Lecb_decrypt - cmpq $128,%rdx + cmpq $0x80,%rdx jb .Lecb_enc_tail movdqu (%rdi),%xmm2 @@ -515,7 +515,7 @@ aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp .Lecb_enc_loop8_enter .align 16 .Lecb_enc_loop8: @@ -543,7 +543,7 @@ aesni_ecb_encrypt: call _aesni_encrypt8 - subq $128,%rdx + subq $0x80,%rdx jnc .Lecb_enc_loop8 movups %xmm2,(%rsi) @@ -557,22 +557,22 @@ aesni_ecb_encrypt: movups %xmm8,96(%rsi) movups %xmm9,112(%rsi) leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz .Lecb_ret .Lecb_enc_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lecb_enc_one movups 16(%rdi),%xmm3 je .Lecb_enc_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lecb_enc_three movups 48(%rdi),%xmm5 je .Lecb_enc_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb .Lecb_enc_five movups 80(%rdi),%xmm7 je .Lecb_enc_six @@ -646,7 +646,7 @@ aesni_ecb_encrypt: .align 16 .Lecb_decrypt: - cmpq $128,%rdx + cmpq $0x80,%rdx jb .Lecb_dec_tail movdqu (%rdi),%xmm2 @@ -658,7 +658,7 @@ aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp .Lecb_dec_loop8_enter .align 16 .Lecb_dec_loop8: @@ -687,7 +687,7 @@ aesni_ecb_encrypt: call _aesni_decrypt8 movups (%r11),%xmm0 - subq $128,%rdx + subq $0x80,%rdx jnc .Lecb_dec_loop8 movups %xmm2,(%rsi) @@ -709,22 +709,22 @@ aesni_ecb_encrypt: movups %xmm9,112(%rsi) pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz .Lecb_ret .Lecb_dec_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lecb_dec_one movups 16(%rdi),%xmm3 je .Lecb_dec_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lecb_dec_three movups 48(%rdi),%xmm5 je .Lecb_dec_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb .Lecb_dec_five movups 80(%rdi),%xmm7 je .Lecb_dec_six @@ -1598,7 +1598,7 @@ aesni_xts_encrypt: movdqa .Lxts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -1697,7 +1697,7 @@ aesni_xts_encrypt: .byte 102,15,56,220,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 .align 32 .Lxts_enc_loop6: @@ -1836,13 +1836,13 @@ aesni_xts_encrypt: jz .Lxts_enc_done pxor %xmm0,%xmm11 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lxts_enc_one pxor %xmm0,%xmm12 je .Lxts_enc_two pxor %xmm0,%xmm13 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lxts_enc_three pxor %xmm0,%xmm14 
je .Lxts_enc_four @@ -2069,7 +2069,7 @@ aesni_xts_decrypt: movdqa .Lxts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -2168,7 +2168,7 @@ aesni_xts_decrypt: .byte 102,15,56,222,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 .align 32 .Lxts_dec_loop6: @@ -2308,13 +2308,13 @@ aesni_xts_decrypt: jz .Lxts_dec_done pxor %xmm0,%xmm12 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lxts_dec_one pxor %xmm0,%xmm13 je .Lxts_dec_two pxor %xmm0,%xmm14 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lxts_dec_three je .Lxts_dec_four @@ -2345,7 +2345,7 @@ aesni_xts_decrypt: pcmpgtd %xmm15,%xmm14 movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - pshufd $19,%xmm14,%xmm11 + pshufd $0x13,%xmm14,%xmm11 andq $15,%r9 jz .Lxts_dec_ret @@ -2634,7 +2634,7 @@ aesni_cbc_encrypt: leaq -8(%rax),%rbp movups (%r8),%xmm10 movl %r10d,%eax - cmpq $80,%rdx + cmpq $0x50,%rdx jbe .Lcbc_dec_tail movups (%rcx),%xmm0 @@ -2650,14 +2650,14 @@ aesni_cbc_encrypt: movdqu 80(%rdi),%xmm7 movdqa %xmm6,%xmm15 movl OPENSSL_ia32cap_P+4(%rip),%r9d - cmpq $112,%rdx + cmpq $0x70,%rdx jbe .Lcbc_dec_six_or_seven andl $71303168,%r9d - subq $80,%rdx + subq $0x50,%rdx cmpl $4194304,%r9d je .Lcbc_dec_loop6_enter - subq $32,%rdx + subq $0x20,%rdx leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .align 16 @@ -2672,7 +2672,7 @@ aesni_cbc_encrypt: movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 xorq %r11,%r11 - cmpq $112,%rdx + cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 pxor %xmm0,%xmm7 @@ -2857,21 +2857,21 @@ aesni_cbc_encrypt: movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi - subq $128,%rdx + subq $0x80,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx - addq $112,%rdx + addq $0x70,%rdx jle .Lcbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi - cmpq $80,%rdx + cmpq $0x50,%rdx jbe .Lcbc_dec_tail movaps %xmm11,%xmm2 .Lcbc_dec_six_or_seven: - cmpq $96,%rdx + cmpq $0x60,%rdx ja .Lcbc_dec_seven movaps %xmm7,%xmm8 @@ -2964,33 +2964,33 @@ aesni_cbc_encrypt: movl %r10d,%eax movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - subq $96,%rdx + subq $0x60,%rdx ja .Lcbc_dec_loop6 movdqa %xmm7,%xmm2 - addq $80,%rdx + addq $0x50,%rdx jle .Lcbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_tail: movups (%rdi),%xmm2 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 movaps %xmm2,%xmm11 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 movaps %xmm3,%xmm12 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 movaps %xmm4,%xmm13 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 @@ -3015,7 +3015,7 @@ aesni_cbc_encrypt: movdqa %xmm6,%xmm2 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 - subq $16,%rdx + subq $0x10,%rdx jmp .Lcbc_dec_tail_collected .align 16 @@ -3332,7 +3332,7 @@ __aesni_set_encrypt_key: pslldq $4,%xmm0 pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0xff,%xmm0,%xmm3 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 @@ -3419,7 +3419,7 @@ __aesni_set_encrypt_key: decl %r10d jz .Ldone_key256 - pshufd $255,%xmm0,%xmm2 + pshufd $0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -3462,11 +3462,11 @@ __aesni_set_encrypt_key: movups %xmm0,(%rax) leaq 16(%rax),%rax .Lkey_expansion_128_cold: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 + shufps 
$0b11111111,%xmm1,%xmm1 xorps %xmm1,%xmm0 .byte 0xf3,0xc3 @@ -3477,25 +3477,25 @@ __aesni_set_encrypt_key: .Lkey_expansion_192a_cold: movaps %xmm2,%xmm5 .Lkey_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 pslldq $4,%xmm3 xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 + pshufd $0b01010101,%xmm1,%xmm1 pxor %xmm3,%xmm2 pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0b11111111,%xmm0,%xmm3 pxor %xmm3,%xmm2 .byte 0xf3,0xc3 .align 16 .Lkey_expansion_192b: movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 + shufps $0b01000100,%xmm0,%xmm5 movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 + shufps $0b01001110,%xmm2,%xmm3 movups %xmm3,16(%rax) leaq 32(%rax),%rax jmp .Lkey_expansion_192b_warm @@ -3505,11 +3505,11 @@ __aesni_set_encrypt_key: movups %xmm2,(%rax) leaq 16(%rax),%rax .Lkey_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 + shufps $0b11111111,%xmm1,%xmm1 xorps %xmm1,%xmm0 .byte 0xf3,0xc3 @@ -3518,11 +3518,11 @@ __aesni_set_encrypt_key: movups %xmm0,(%rax) leaq 16(%rax),%rax - shufps $16,%xmm2,%xmm4 + shufps $0b00010000,%xmm2,%xmm4 xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 + shufps $0b10001100,%xmm2,%xmm4 xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 + shufps $0b10101010,%xmm1,%xmm1 xorps %xmm1,%xmm2 .byte 0xf3,0xc3 .size aesni_set_encrypt_key,.-aesni_set_encrypt_key diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s index 5b363a5eef9020..0fd201167f647a 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/bsaes-x86_64.s @@ -324,45 +324,45 @@ _bsaes_encrypt8_bitslice: pxor %xmm2,%xmm5 decl %r10d jl .Lenc_done - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm3,%xmm9 + pshufd $0x93,%xmm3,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm5,%xmm10 + pshufd $0x93,%xmm5,%xmm10 pxor %xmm9,%xmm3 - pshufd $147,%xmm2,%xmm11 + pshufd $0x93,%xmm2,%xmm11 pxor %xmm10,%xmm5 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm2 - pshufd $147,%xmm1,%xmm13 + pshufd $0x93,%xmm1,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm1 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm2,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm5,%xmm11 - pshufd $78,%xmm2,%xmm7 + pshufd $0x4E,%xmm2,%xmm7 pxor %xmm1,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm3,%xmm10 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm1,%xmm5 + pshufd $0x4E,%xmm1,%xmm5 pxor %xmm11,%xmm7 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm12,%xmm8 pxor %xmm10,%xmm2 pxor %xmm14,%xmm6 @@ -796,24 +796,24 @@ _bsaes_decrypt8: decl %r10d jl .Ldec_done - pshufd $78,%xmm15,%xmm7 - pshufd $78,%xmm2,%xmm13 + pshufd $0x4E,%xmm15,%xmm7 + pshufd $0x4E,%xmm2,%xmm13 pxor %xmm15,%xmm7 - pshufd $78,%xmm4,%xmm14 + pshufd $0x4E,%xmm4,%xmm14 pxor %xmm2,%xmm13 - pshufd $78,%xmm0,%xmm8 + pshufd 
$0x4E,%xmm0,%xmm8 pxor %xmm4,%xmm14 - pshufd $78,%xmm5,%xmm9 + pshufd $0x4E,%xmm5,%xmm9 pxor %xmm0,%xmm8 - pshufd $78,%xmm3,%xmm10 + pshufd $0x4E,%xmm3,%xmm10 pxor %xmm5,%xmm9 pxor %xmm13,%xmm15 pxor %xmm13,%xmm0 - pshufd $78,%xmm1,%xmm11 + pshufd $0x4E,%xmm1,%xmm11 pxor %xmm3,%xmm10 pxor %xmm7,%xmm5 pxor %xmm8,%xmm3 - pshufd $78,%xmm6,%xmm12 + pshufd $0x4E,%xmm6,%xmm12 pxor %xmm1,%xmm11 pxor %xmm14,%xmm0 pxor %xmm9,%xmm1 @@ -827,45 +827,45 @@ _bsaes_decrypt8: pxor %xmm14,%xmm1 pxor %xmm14,%xmm6 pxor %xmm12,%xmm4 - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm5,%xmm9 + pshufd $0x93,%xmm5,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm3,%xmm10 + pshufd $0x93,%xmm3,%xmm10 pxor %xmm9,%xmm5 - pshufd $147,%xmm1,%xmm11 + pshufd $0x93,%xmm1,%xmm11 pxor %xmm10,%xmm3 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm1 - pshufd $147,%xmm2,%xmm13 + pshufd $0x93,%xmm2,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm2 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm1,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm3,%xmm11 - pshufd $78,%xmm1,%xmm7 + pshufd $0x4E,%xmm1,%xmm7 pxor %xmm2,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm5,%xmm10 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm2,%xmm3 + pshufd $0x4E,%xmm2,%xmm3 pxor %xmm11,%xmm7 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm12,%xmm8 pxor %xmm1,%xmm10 pxor %xmm14,%xmm6 @@ -1552,20 +1552,20 @@ bsaes_xts_encrypt: movdqa %xmm7,(%rax) andq $-16,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc .Lxts_enc_short jmp .Lxts_enc_loop .align 16 .Lxts_enc_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1573,7 +1573,7 @@ bsaes_xts_encrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1582,7 +1582,7 @@ bsaes_xts_encrypt: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1592,7 +1592,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1602,7 +1602,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1612,7 +1612,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1622,7 +1622,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -1666,20 +1666,20 
@@ bsaes_xts_encrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc .Lxts_enc_loop .Lxts_enc_short: - addq $128,%r14 + addq $0x80,%r14 jz .Lxts_enc_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1687,7 +1687,7 @@ bsaes_xts_encrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1698,7 +1698,7 @@ bsaes_xts_encrypt: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je .Lxts_enc_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1710,7 +1710,7 @@ bsaes_xts_encrypt: cmpq $32,%r14 je .Lxts_enc_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1722,7 +1722,7 @@ bsaes_xts_encrypt: cmpq $48,%r14 je .Lxts_enc_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1734,7 +1734,7 @@ bsaes_xts_encrypt: cmpq $64,%r14 je .Lxts_enc_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1746,7 +1746,7 @@ bsaes_xts_encrypt: cmpq $80,%r14 je .Lxts_enc_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2011,20 +2011,20 @@ bsaes_xts_decrypt: shlq $4,%rax subq %rax,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc .Lxts_dec_short jmp .Lxts_dec_loop .align 16 .Lxts_dec_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2032,7 +2032,7 @@ bsaes_xts_decrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2041,7 +2041,7 @@ bsaes_xts_decrypt: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2051,7 +2051,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2061,7 +2061,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2071,7 +2071,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2081,7 +2081,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2125,20 +2125,20 @@ bsaes_xts_decrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd 
$19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc .Lxts_dec_loop .Lxts_dec_short: - addq $128,%r14 + addq $0x80,%r14 jz .Lxts_dec_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2146,7 +2146,7 @@ bsaes_xts_decrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2157,7 +2157,7 @@ bsaes_xts_decrypt: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je .Lxts_dec_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2169,7 +2169,7 @@ bsaes_xts_decrypt: cmpq $32,%r14 je .Lxts_dec_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2181,7 +2181,7 @@ bsaes_xts_decrypt: cmpq $48,%r14 je .Lxts_dec_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2193,7 +2193,7 @@ bsaes_xts_decrypt: cmpq $64,%r14 je .Lxts_dec_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2205,7 +2205,7 @@ bsaes_xts_decrypt: cmpq $80,%r14 je .Lxts_dec_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2382,7 +2382,7 @@ bsaes_xts_decrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 movdqa %xmm6,%xmm5 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s index b9d6df5134ec60..bf7c2b0b6f6b04 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/aes/vpaes-x86_64.s @@ -60,7 +60,7 @@ _vpaes_encrypt_core: addq $16,%r11 pxor %xmm0,%xmm3 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 @@ -120,10 +120,10 @@ _vpaes_decrypt_core: pand %xmm9,%xmm0 .byte 102,15,56,0,208 movdqa .Lk_dipt+16(%rip),%xmm0 - xorq $48,%r11 + xorq $0x30,%r11 leaq .Lk_dsbd(%rip),%r10 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 pxor %xmm5,%xmm2 movdqa .Lk_mc_forward+48(%rip),%xmm5 pxor %xmm2,%xmm0 @@ -242,7 +242,7 @@ _vpaes_schedule_core: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 movdqu %xmm3,(%rdx) - xorq $48,%r8 + xorq $0x30,%r8 .Lschedule_go: cmpl $192,%esi @@ -332,7 +332,7 @@ _vpaes_schedule_core: call _vpaes_schedule_mangle - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 movdqa %xmm7,%xmm5 movdqa %xmm6,%xmm7 call _vpaes_schedule_low_round @@ -399,8 +399,8 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm1 - pshufd $254,%xmm7,%xmm0 + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 @@ -437,7 +437,7 @@ _vpaes_schedule_round: pxor %xmm1,%xmm7 - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 .byte 102,15,58,15,192,1 @@ -596,7 +596,7 @@ _vpaes_schedule_mangle: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 addq $-16,%r8 - andq $48,%r8 + andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -614,7 +614,7 @@ vpaes_set_encrypt_key: movl %eax,240(%rdx) movl $0,%ecx - movl $48,%r8d + movl $0x30,%r8d call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s index b43eb278e2cc06..4a1211329c67ea 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/rsaz-x86_64.s @@ -461,48 +461,94 @@ rsaz_512_mul_gather4: pushq %r14 pushq %r15 - movl %r9d,%r9d - subq $128+24,%rsp + subq $152,%rsp .Lmul_gather4_body: - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - movl (%rdx,%r9,4),%ebx -.byte 102,72,15,110,201 + movd %r9d,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 + + movdqa 0(%rdx),%xmm8 + movdqa 16(%rdx),%xmm9 + movdqa 32(%rdx),%xmm10 + movdqa 48(%rdx),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rdx),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rdx),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rdx),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rdx),%xmm15 + leaq 128(%rdx),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) - shlq $32,%rax - orq %rax,%rbx movq (%rsi),%rax movq 8(%rsi),%rcx - leaq 128(%rdx,%r9,4),%rbp mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 @@ -515,14 +561,12 @@ rsaz_512_mul_gather4: adcq $0,%r13 mulq %rbx - leaq 128(%rbp),%rbp addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 @@ -534,6 +578,35 @@ rsaz_512_mul_gather4: .align 32 .Loop_mul_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax @@ -542,7 +615,6 @@ rsaz_512_mul_gather4: adcq $0,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx @@ -551,7 +623,6 @@ rsaz_512_mul_gather4: 
adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx @@ -560,7 +631,6 @@ rsaz_512_mul_gather4: adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -569,7 +639,6 @@ rsaz_512_mul_gather4: adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -594,7 +663,6 @@ rsaz_512_mul_gather4: adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx @@ -602,7 +670,6 @@ rsaz_512_mul_gather4: movq %rdx,%r15 adcq $0,%r15 - leaq 128(%rbp),%rbp leaq 8(%rdi),%rdi decl %ecx @@ -617,8 +684,8 @@ rsaz_512_mul_gather4: movq %r14,48(%rdi) movq %r15,56(%rdi) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -667,7 +734,7 @@ rsaz_512_mul_scatter4: movl %r9d,%r9d subq $128+24,%rsp .Lmul_scatter4_body: - leaq (%r8,%r9,4),%r8 + leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 @@ -703,30 +770,14 @@ rsaz_512_mul_scatter4: call __rsaz_512_subtract - movl %r8d,0(%rsi) - shrq $32,%r8 - movl %r9d,128(%rsi) - shrq $32,%r9 - movl %r10d,256(%rsi) - shrq $32,%r10 - movl %r11d,384(%rsi) - shrq $32,%r11 - movl %r12d,512(%rsi) - shrq $32,%r12 - movl %r13d,640(%rsi) - shrq $32,%r13 - movl %r14d,768(%rsi) - shrq $32,%r14 - movl %r15d,896(%rsi) - shrq $32,%r15 - movl %r8d,64(%rsi) - movl %r9d,192(%rsi) - movl %r10d,320(%rsi) - movl %r11d,448(%rsi) - movl %r12d,576(%rsi) - movl %r13d,704(%rsi) - movl %r14d,832(%rsi) - movl %r15d,960(%rsi) + movq %r8,0(%rsi) + movq %r9,128(%rsi) + movq %r10,256(%rsi) + movq %r11,384(%rsi) + movq %r12,512(%rsi) + movq %r13,640(%rsi) + movq %r14,768(%rsi) + movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -1079,16 +1130,14 @@ __rsaz_512_mul: .type rsaz_512_scatter4,@function .align 16 rsaz_512_scatter4: - leaq (%rdi,%rdx,4),%rdi + leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi - movl %eax,(%rdi) - shrq $32,%rax - movl %eax,64(%rdi) + movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz .Loop_scatter @@ -1099,19 +1148,72 @@ rsaz_512_scatter4: .type rsaz_512_gather4,@function .align 16 rsaz_512_gather4: - leaq (%rsi,%rdx,4),%rsi + movd %edx,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp .Loop_gather .align 16 .Loop_gather: - movl (%rsi),%eax - movl 64(%rsi),%r8d + movdqa 0(%rsi),%xmm8 + movdqa 16(%rsi),%xmm9 + movdqa 32(%rsi),%xmm10 + movdqa 48(%rsi),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rsi),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rsi),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rsi),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi - shlq $32,%r8 - orq %r8,%rax - movq %rax,(%rdi) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq 
%xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz .Loop_gather .byte 0xf3,0xc3 +.LSEH_end_rsaz_512_gather4: .size rsaz_512_gather4,.-rsaz_512_gather4 + +.align 64 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s index eed057ad6a1a8a..f4e5337565bbc7 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-gf2m.s @@ -242,7 +242,7 @@ bn_GF2m_mul_2x2: movq %rcx,56(%rsp) movq %r8,64(%rsp) - movq $15,%r8 + movq $0xf,%r8 movq %rsi,%rax movq %rcx,%rbp call _mul_1x1 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s index b0981692130394..9e0019c163771c 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont.s @@ -633,20 +633,20 @@ bn_sqr8x_mont: - leaq -64(%rsp,%r9,4),%r11 + leaq -64(%rsp,%r9,2),%r11 movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,4),%rsp + leaq -64(%rsp,%r9,2),%rsp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: - leaq 4096-64(,%r9,4),%r10 - leaq -64(%rsp,%r9,4),%rsp + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -656,58 +656,80 @@ bn_sqr8x_mont: movq %r9,%r10 negq %r9 - leaq 64(%rsp,%r9,2),%r11 movq %r8,32(%rsp) movq %rax,40(%rsp) .Lsqr8x_body: - movq %r9,%rbp -.byte 102,73,15,110,211 - shrq $3+2,%rbp - movl OPENSSL_ia32cap_P+8(%rip),%eax - jmp .Lsqr8x_copy_n - -.align 32 -.Lsqr8x_copy_n: - movq 0(%rcx),%xmm0 - movq 8(%rcx),%xmm1 - movq 16(%rcx),%xmm3 - movq 24(%rcx),%xmm4 - leaq 32(%rcx),%rcx - movdqa %xmm0,0(%r11) - movdqa %xmm1,16(%r11) - movdqa %xmm3,32(%r11) - movdqa %xmm4,48(%r11) - leaq 64(%r11),%r11 - decq %rbp - jnz .Lsqr8x_copy_n - +.byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 call bn_sqr8x_internal + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz .Lsqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 + pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi - jmp .Lsqr8x_zero + jmp .Lsqr8x_cond_copy .align 32 -.Lsqr8x_zero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - movdqa %xmm0,32(%rax) - movdqa %xmm0,48(%rax) - leaq 64(%rax),%rax - movdqa %xmm0,0(%rdx) - movdqa %xmm0,16(%rdx) - movdqa %xmm0,32(%rdx) - movdqa %xmm0,48(%rdx) - leaq 64(%rdx),%rdx - decq %r9 - jnz .Lsqr8x_zero +.Lsqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq 
$32,%r9 + jnz .Lsqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s index d2d1fbf4e7d635..8afe2496952006 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/bn/x86_64-mont5.s @@ -14,46 +14,151 @@ bn_mul_mont_gather5: .Lmul_enter: movl %r9d,%r9d movq %rsp,%rax - movl 8(%rsp),%r10d + movd 8(%rsp),%xmm5 + leaq .Linc(%rip),%r10 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 + leaq 2(%r9),%r11 negq %r11 - leaq (%rsp,%r11,8),%rsp + leaq -264(%rsp,%r11,8),%rsp andq $-1024,%rsp movq %rax,8(%rsp,%r9,8) .Lmul_body: - movq %rdx,%r12 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 
272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 leaq 256(%r12),%r12 - por %xmm3,%xmm0 - .byte 102,72,15,126,195 movq (%r8),%r8 @@ -62,29 +167,14 @@ bn_mul_mont_gather5: xorq %r14,%r14 xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -117,14 +207,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .L1st -.byte 102,72,15,126,195 addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 @@ -138,33 +226,78 @@ bn_mul_mont_gather5: jmp .Louter .align 16 .Louter: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax +.byte 102,72,15,126,195 + xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -200,15 +333,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .Linner -.byte 102,72,15,126,195 - addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 - movq (%rsp,%r15,8),%r10 + movq (%rsp,%r9,8),%r10 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx @@ -255,6 +385,7 @@ bn_mul_mont_gather5: movq 8(%rsp,%r9,8),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -277,10 +408,10 @@ bn_mul4x_mont_gather5: pushq %r13 pushq %r14 pushq %r15 + .byte 0x67 - movl %r9d,%r10d shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 @@ -290,19 +421,21 @@ bn_mul4x_mont_gather5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + 
leaq -320(%rsp,%r9,2),%rsp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -318,6 +451,7 @@ bn_mul4x_mont_gather5: movq 40(%rsp),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -333,47 +467,141 @@ bn_mul4x_mont_gather5: .align 32 mul4x_internal: shlq $5,%r9 - movl 8(%rax),%r10d - leaq 256(%rdx,%r9,1),%r13 + movd 8(%rax),%xmm5 + leaq .Linc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 - - movq -96(%r12),%xmm0 - leaq 256(%r12),%r14 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 -.byte 0x67 - por %xmm1,%xmm0 - movq -96(%r14),%xmm1 -.byte 0x67 - pand %xmm7,%xmm3 -.byte 0x67 - por %xmm2,%xmm0 - movq -32(%r14),%xmm2 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq 32(%r14),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 
48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 .byte 102,72,15,126,195 - movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) @@ -387,26 +615,10 @@ mul4x_internal: movq %rax,%r10 movq (%rcx),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq %r10,%rbp - - - - - - - - leaq 64+8(%rsp,%r11,8),%r14 + leaq 64+8(%rsp),%r14 movq %rdx,%r11 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - leaq 512(%r12),%r12 - por %xmm1,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax @@ -415,7 +627,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -425,7 +637,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -435,7 +647,7 @@ mul4x_internal: .L1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -451,7 +663,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -481,7 +693,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -490,7 +702,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -500,7 +712,7 @@ mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -516,7 +728,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -529,8 +741,7 @@ mul4x_internal: movq %rdi,-16(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -541,6 +752,63 @@ mul4x_internal: .align 32 .Louter4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx @@ -548,25 +816,11 @@ mul4x_internal: movq (%rcx),%rax adcq $0,%rdx - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 
96(%r12),%xmm3 - imulq %r10,%rbp -.byte 0x67 movq %rdx,%r11 movq %rdi,(%r14) - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 leaq (%r14,%r9,1),%r14 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 @@ -576,7 +830,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -588,7 +842,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp .Linner4x @@ -597,7 +851,7 @@ mul4x_internal: .Linner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -615,7 +869,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -649,7 +903,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -660,7 +914,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 @@ -670,7 +924,7 @@ mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -689,7 +943,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 movq %rbp,%rax - movq -16(%rcx),%rbp + movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -704,9 +958,8 @@ mul4x_internal: movq %r13,-24(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 movq %rdi,-16(%r14) - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -717,16 +970,23 @@ mul4x_internal: cmpq 16+8(%rsp),%r12 jb .Louter4x + xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi - xorq $1,%rdi + subq %rdi,%rax leaq (%r14,%r9,1),%rbx - leaq (%rcx,%rdi,8),%rbp + movq (%rcx),%r12 + leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi - jmp .Lsqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry .size mul4x_internal,.-mul4x_internal .globl bn_power5 .type bn_power5,@function @@ -739,9 +999,9 @@ bn_power5: pushq %r13 pushq %r14 pushq %r15 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 @@ -751,19 +1011,20 @@ bn_power5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -791,10 +1052,15 @@ bn_power5: .byte 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 @@ -1338,9 +1604,9 @@ __bn_sqr8x_internal: movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xorq %rax,%rax - leaq (%rbp,%r9,2),%rcx + leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi @@ -1373,14 +1639,14 @@ sqr8x_reduction: .align 32 .L8x_reduce: 
mulq %rbx - movq 16(%rbp),%rax + movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) @@ -1389,7 +1655,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi @@ -1398,7 +1664,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 @@ -1407,7 +1673,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1415,7 +1681,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1423,7 +1689,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1441,7 +1707,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_reduce - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp @@ -1467,14 +1733,14 @@ sqr8x_reduction: .L8x_tail: mulq %rbx addq %rax,%r8 - movq 16(%rbp),%rax + movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi @@ -1483,7 +1749,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 @@ -1491,7 +1757,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 @@ -1499,7 +1765,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1507,7 +1773,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1515,7 +1781,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1533,7 +1799,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_tail - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_tail_done @@ -1579,7 +1845,7 @@ sqr8x_reduction: adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax - movq -16(%rbp),%rcx + movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 @@ -1597,44 +1863,62 @@ sqr8x_reduction: cmpq %rdx,%rdi jb .L8x_reduction_loop - - subq %r15,%rcx + .byte 0xf3,0xc3 +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.type __bn_post4x_internal,@function +.align 32 +__bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx - adcq %rsi,%rsi movq %r9,%rcx - orq %rsi,%rax .byte 102,72,15,126,207 - xorq $1,%rax + negq %rax .byte 102,72,15,126,206 - leaq (%rbp,%rax,8),%rbp sarq $3+2,%rcx - jmp .Lsqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry -.align 32 +.align 16 .Lsqr4x_sub: -.byte 0x66 - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rbx),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 32(%rbp),%r14 + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + 
adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 movq %r12,0(%rdi) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp + leaq 32(%rbx),%rbx movq %r13,8(%rdi) + sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr4x_sub + movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 -.size bn_sqr8x_internal,.-bn_sqr8x_internal +.size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .type bn_from_montgomery,@function .align 32 @@ -1656,10 +1940,9 @@ bn_from_mont8x: pushq %r13 pushq %r14 pushq %r15 -.byte 0x67 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 @@ -1669,19 +1952,20 @@ bn_from_mont8x: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lfrom_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -1732,7 +2016,8 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 leaq 48(%rsp),%rax @@ -1799,45 +2084,169 @@ bn_scatter5: .globl bn_gather5 .type bn_gather5,@function -.align 16 +.align 32 bn_gather5: - movl %ecx,%r11d - shrl $3,%ecx - andq $7,%r11 - notl %ecx - leaq .Lmagic_masks(%rip),%rax - andl $3,%ecx - leaq 128(%rdx,%r11,8),%rdx - movq 0(%rax,%rcx,8),%xmm4 - movq 8(%rax,%rcx,8),%xmm5 - movq 16(%rax,%rcx,8),%xmm6 - movq 24(%rax,%rcx,8),%xmm7 +.LSEH_begin_bn_gather5: + +.byte 0x4c,0x8d,0x14,0x24 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq .Linc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) jmp .Lgather -.align 16 -.Lgather: - movq -128(%rdx),%xmm0 - movq -64(%rdx),%xmm1 - pand %xmm4,%xmm0 - movq 0(%rdx),%xmm2 - pand %xmm5,%xmm1 - movq 
64(%rdx),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 -.byte 0x67,0x67 - por %xmm2,%xmm0 - leaq 256(%rdx),%rdx - por %xmm3,%xmm0 +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz .Lgather + + leaq (%r10),%rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 .align 64 -.Lmagic_masks: -.long 0,0, 0,0, 0,0, -1,-1 -.long 0,0, 0,0, 0,0, 0,0 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s index ac7da4dfc2d19e..1117381f316d9e 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/camellia/cmll-x86_64.s @@ -1624,7 +1624,7 @@ Camellia_cbc_encrypt: leaq -64-63(%rcx),%r10 subq %rsp,%r10 negq %r10 - andq $960,%r10 + andq $0x3C0,%r10 subq %r10,%rsp diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s index 393782329e0b91..7876e382994517 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/ec/ecp_nistz256-x86_64.s @@ -1121,6 +1121,7 @@ ecp_nistz256_point_double: pushq %r15 subq $160+8,%rsp +.Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 @@ -1341,7 +1342,7 @@ ecp_nistz256_point_add: por %xmm1,%xmm3 movdqu 0(%rsi),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 @@ -1351,7 +1352,7 @@ ecp_nistz256_point_add: movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1371,10 +1372,10 @@ ecp_nistz256_point_add: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 por 
%xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 @@ -1383,6 +1384,7 @@ ecp_nistz256_point_add: movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi @@ -1474,7 +1476,7 @@ ecp_nistz256_point_add: testq %r8,%r8 jnz .Ladd_proceedq testq %r9,%r9 - jz .Ladd_proceedq + jz .Ladd_doubleq .byte 102,72,15,126,199 pxor %xmm0,%xmm0 @@ -1486,6 +1488,13 @@ ecp_nistz256_point_add: movdqu %xmm0,80(%rdi) jmp .Ladd_doneq +.align 32 +.Ladd_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp .Lpoint_double_shortcutq + .align 32 .Ladd_proceedq: movq 0+64(%rsp),%rax @@ -1733,13 +1742,13 @@ ecp_nistz256_point_add_affine: por %xmm1,%xmm3 movdqu 0(%rbx),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1755,13 +1764,13 @@ ecp_nistz256_point_add_affine: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s index 462ef7fe739f2f..e9ffdc2de2ea85 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/modes/ghash-x86_64.s @@ -20,14 +20,14 @@ gcm_gmult_4bit: movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl movq %r8,%rdx jmp .Loop1 .align 16 .Loop1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 @@ -43,13 +43,13 @@ gcm_gmult_4bit: js .Lbreak1 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 @@ -58,19 +58,19 @@ gcm_gmult_4bit: .align 16 .Lbreak1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 @@ -661,10 +661,10 @@ gcm_ghash_4bit: gcm_init_clmul: .L_init_clmul: movdqu (%rsi),%xmm2 - pshufd $78,%xmm2,%xmm2 + pshufd $0b01001110,%xmm2,%xmm2 - pshufd $255,%xmm2,%xmm4 + pshufd $0b11111111,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 @@ -678,11 +678,11 @@ gcm_init_clmul: pxor %xmm5,%xmm2 - pshufd $78,%xmm2,%xmm6 + pshufd $0b01001110,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -718,8 +718,8 @@ gcm_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm2,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 @@ -727,7 +727,7 @@ gcm_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 
102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -765,7 +765,7 @@ gcm_init_clmul: pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -801,8 +801,8 @@ gcm_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm5,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm5,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 @@ -822,7 +822,7 @@ gcm_gmult_clmul: movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -874,20 +874,20 @@ gcm_ghash_clmul: movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail movdqu 16(%rsi),%xmm6 movl OPENSSL_ia32cap_P+4(%rip),%eax - cmpq $48,%rcx + cmpq $0x30,%rcx jb .Lskip4x andl $71303168,%eax cmpl $4194304,%eax je .Lskip4x - subq $48,%rcx - movq $11547335547999543296,%rax + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 @@ -899,14 +899,14 @@ gcm_ghash_clmul: .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 @@ -921,12 +921,12 @@ gcm_ghash_clmul: .byte 102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 @@ -934,7 +934,7 @@ gcm_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jc .Ltail4x jmp .Lmod4_loop @@ -949,14 +949,14 @@ gcm_ghash_clmul: movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 @@ -1000,7 +1000,7 @@ gcm_ghash_clmul: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 @@ -1010,14 +1010,14 @@ gcm_ghash_clmul: movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jnc .Lmod4_loop .Ltail4x: @@ -1061,10 +1061,10 @@ gcm_ghash_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - addq $64,%rcx + addq $0x40,%rcx jz .Ldone movdqu 32(%rsi),%xmm7 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail .Lskip4x: @@ -1079,7 +1079,7 @@ gcm_ghash_clmul: pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 @@ -1087,7 +1087,7 @@ gcm_ghash_clmul: leaq 32(%rdx),%rdx nop - subq $32,%rcx + subq $0x20,%rcx jbe .Leven_tail nop 
jmp .Lmod_loop @@ -1096,7 +1096,7 @@ gcm_ghash_clmul: .Lmod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1134,7 +1134,7 @@ gcm_ghash_clmul: pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 - pshufd $78,%xmm5,%xmm4 + pshufd $0b01001110,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 @@ -1150,13 +1150,13 @@ gcm_ghash_clmul: .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 - subq $32,%rcx + subq $0x20,%rcx ja .Lmod_loop .Leven_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1204,7 +1204,7 @@ gcm_ghash_clmul: .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s index 8a1e5e7b595379..4d25c99cf65bd3 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-mb-x86_64.s @@ -2599,10 +2599,10 @@ _shaext_shortcut: punpcklqdq %xmm5,%xmm0 punpckhqdq %xmm5,%xmm8 - pshufd $63,%xmm7,%xmm1 - pshufd $127,%xmm7,%xmm9 - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00111111,%xmm7,%xmm1 + pshufd $0b01111111,%xmm7,%xmm9 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 jmp .Loop_shaext .align 32 @@ -2857,8 +2857,8 @@ _shaext_shortcut: .byte 69,15,58,204,193,3 .byte 69,15,56,200,214 - pshufd $0,%xmm6,%xmm11 - pshufd $85,%xmm6,%xmm12 + pshufd $0x00,%xmm6,%xmm11 + pshufd $0x55,%xmm6,%xmm12 movdqa %xmm6,%xmm7 pcmpgtd %xmm4,%xmm11 pcmpgtd %xmm4,%xmm12 @@ -2888,8 +2888,8 @@ _shaext_shortcut: movl 280(%rsp),%edx - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 movdqa %xmm0,%xmm6 punpckldq %xmm8,%xmm0 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s index 38b7df19704ba2..38e9956cb68384 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha1-x86_64.s @@ -1240,9 +1240,9 @@ _shaext_shortcut: movdqa K_XX_XX+160(%rip),%xmm3 movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 + pshufd $0b00011011,%xmm0,%xmm0 movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm1,%xmm1 movdqu 32(%rsi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%rsi),%xmm7 @@ -1392,8 +1392,8 @@ _shaext_shortcut: jnz .Loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) .byte 0xf3,0xc3 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s index 7f8e35a92e0a15..7655283b9885b9 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-mb-x86_64.s @@ -2677,10 +2677,10 @@ _shaext_shortcut: punpckhqdq %xmm8,%xmm14 punpckhqdq %xmm10,%xmm15 - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 jmp .Loop_shaext .align 32 @@ -2712,11 +2712,11 @@ 
_shaext_shortcut: movdqa %xmm2,%xmm0 movdqa %xmm15,112(%rsp) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pxor %xmm12,%xmm4 movdqa %xmm12,64(%rsp) .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pxor %xmm14,%xmm8 movdqa %xmm14,96(%rsp) movdqa 16-128(%rbp),%xmm1 @@ -2734,11 +2734,11 @@ _shaext_shortcut: .byte 102,68,15,56,0,211 prefetcht0 127(%r9) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,68,15,56,0,219 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 32-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2751,14 +2751,14 @@ _shaext_shortcut: movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,15,58,15,222,4 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 48-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2775,13 +2775,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 64-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2797,13 +2797,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 80-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2819,13 +2819,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 96-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2841,13 +2841,13 @@ _shaext_shortcut: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 112-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2863,13 +2863,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 128-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2885,13 +2885,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 144-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2907,13 +2907,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd 
$14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 160-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2929,13 +2929,13 @@ _shaext_shortcut: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 176-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2951,13 +2951,13 @@ _shaext_shortcut: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 192-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2973,13 +2973,13 @@ _shaext_shortcut: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 208-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2995,13 +2995,13 @@ _shaext_shortcut: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 224-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -3018,13 +3018,13 @@ _shaext_shortcut: pxor %xmm6,%xmm6 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 movdqa 240-128(%rbp),%xmm1 paddd %xmm7,%xmm1 movq (%rbx),%xmm7 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 240-128(%rbp),%xmm2 paddd %xmm11,%xmm2 .byte 69,15,56,203,247 @@ -3034,17 +3034,17 @@ _shaext_shortcut: cmovgeq %rsp,%r8 cmpl 4(%rbx),%ecx cmovgeq %rsp,%r9 - pshufd $0,%xmm7,%xmm9 + pshufd $0x00,%xmm7,%xmm9 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 - pshufd $85,%xmm7,%xmm10 + pshufd $0x55,%xmm7,%xmm10 movdqa %xmm7,%xmm11 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pcmpgtd %xmm6,%xmm9 pcmpgtd %xmm6,%xmm10 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pcmpgtd %xmm6,%xmm11 movdqa K256_shaext-16(%rip),%xmm3 .byte 69,15,56,203,247 @@ -3066,10 +3066,10 @@ _shaext_shortcut: movl 280(%rsp),%edx - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 movdqa %xmm12,%xmm5 movdqa %xmm13,%xmm6 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s index d2951d8ea3be3a..ab16a7b618820c 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/sha/sha256-x86_64.s @@ -1754,9 +1754,9 @@ _shaext_shortcut: movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 + pshufd 
$0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 @@ -1775,7 +1775,7 @@ _shaext_shortcut: .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 @@ -1784,7 +1784,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 @@ -1793,7 +1793,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1805,7 +1805,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1816,7 +1816,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1827,7 +1827,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1838,7 +1838,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1849,7 +1849,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1860,7 +1860,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1871,7 +1871,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1882,7 +1882,7 @@ _shaext_shortcut: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1893,7 +1893,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1904,7 +1904,7 @@ _shaext_shortcut: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1915,7 +1915,7 @@ _shaext_shortcut: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 @@ -1924,7 +1924,7 @@ _shaext_shortcut: movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 @@ -1933,7 +1933,7 @@ _shaext_shortcut: paddd %xmm6,%xmm0 nop .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 @@ -1942,9 +1942,9 @@ _shaext_shortcut: paddd %xmm9,%xmm1 jnz .Loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd 
$0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 diff --git a/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s b/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s index 656a5ce85576e0..0e81a290e3ec09 100644 --- a/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s +++ b/deps/openssl/asm_obsolete/x64-elf-gas/x86_64cpuid.s @@ -44,43 +44,43 @@ OPENSSL_ia32_cpuid: movl %eax,%r11d xorl %eax,%eax - cmpl $1970169159,%ebx + cmpl $0x756e6547,%ebx setne %al movl %eax,%r9d - cmpl $1231384169,%edx + cmpl $0x49656e69,%edx setne %al orl %eax,%r9d - cmpl $1818588270,%ecx + cmpl $0x6c65746e,%ecx setne %al orl %eax,%r9d jz .Lintel - cmpl $1752462657,%ebx + cmpl $0x68747541,%ebx setne %al movl %eax,%r10d - cmpl $1769238117,%edx + cmpl $0x69746E65,%edx setne %al orl %eax,%r10d - cmpl $1145913699,%ecx + cmpl $0x444D4163,%ecx setne %al orl %eax,%r10d jnz .Lintel - movl $2147483648,%eax + movl $0x80000000,%eax cpuid - cmpl $2147483649,%eax + cmpl $0x80000001,%eax jb .Lintel movl %eax,%r10d - movl $2147483649,%eax + movl $0x80000001,%eax cpuid orl %ecx,%r9d - andl $2049,%r9d + andl $0x00000801,%r9d - cmpl $2147483656,%r10d + cmpl $0x80000008,%r10d jb .Lintel - movl $2147483656,%eax + movl $0x80000008,%eax cpuid movzbq %cl,%r10 incq %r10 @@ -92,7 +92,7 @@ OPENSSL_ia32_cpuid: shrl $16,%ebx cmpb %r10b,%bl ja .Lgeneric - andl $4026531839,%edx + andl $0xefffffff,%edx jmp .Lgeneric .Lintel: @@ -105,7 +105,7 @@ OPENSSL_ia32_cpuid: cpuid movl %eax,%r10d shrl $14,%r10d - andl $4095,%r10d + andl $0xfff,%r10d cmpl $7,%r11d jb .Lnocacheinfo @@ -118,29 +118,29 @@ OPENSSL_ia32_cpuid: .Lnocacheinfo: movl $1,%eax cpuid - andl $3220176895,%edx + andl $0xbfefffff,%edx cmpl $0,%r9d jne .Lnotintel - orl $1073741824,%edx + orl $0x40000000,%edx andb $15,%ah cmpb $15,%ah jne .Lnotintel - orl $1048576,%edx + orl $0x00100000,%edx .Lnotintel: btl $28,%edx jnc .Lgeneric - andl $4026531839,%edx + andl $0xefffffff,%edx cmpl $0,%r10d je .Lgeneric - orl $268435456,%edx + orl $0x10000000,%edx shrl $16,%ebx cmpb $1,%bl ja .Lgeneric - andl $4026531839,%edx + andl $0xefffffff,%edx .Lgeneric: - andl $2048,%r9d - andl $4294965247,%ecx + andl $0x00000800,%r9d + andl $0xfffff7ff,%ecx orl %ecx,%r9d movl %edx,%r10d @@ -152,9 +152,9 @@ OPENSSL_ia32_cpuid: cmpl $6,%eax je .Ldone .Lclear_avx: - movl $4026525695,%eax + movl $0xefffe7ff,%eax andl %eax,%r9d - andl $4294967263,8(%rdi) + andl $0xffffffdf,8(%rdi) .Ldone: shlq $32,%r9 movl %r10d,%eax diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s index a50170a9a201f5..cb2db3584a444c 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aes-x86_64.s @@ -81,8 +81,8 @@ L$enc_loop: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $65280,%edi - andl $65280,%ebp + andl $0x0000ff00,%edi + andl $0x0000ff00,%ebp xorl %edi,%r10d xorl %ebp,%r11d @@ -94,8 +94,8 @@ L$enc_loop: movl 0(%r14,%rsi,8),%esi movl 0(%r14,%rdi,8),%edi - andl $65280,%esi - andl $65280,%edi + andl $0x0000ff00,%esi + andl $0x0000ff00,%edi shrl $16,%ebx xorl %esi,%r12d xorl %edi,%r8d @@ -108,9 +108,9 @@ L$enc_loop: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $16711680,%edi - andl $16711680,%ebp + andl $0x00ff0000,%esi + andl $0x00ff0000,%edi + andl $0x00ff0000,%ebp xorl %esi,%r10d xorl %edi,%r11d @@ -123,9 +123,9 @@ L$enc_loop: movl 2(%r14,%rdi,8),%edi movl 2(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $4278190080,%edi - andl 
$4278190080,%ebp + andl $0x00ff0000,%esi + andl $0xff000000,%edi + andl $0xff000000,%ebp xorl %esi,%r8d xorl %edi,%r10d @@ -138,8 +138,8 @@ L$enc_loop: movl 2(%r14,%rdi,8),%edi movl 16+0(%r15),%eax - andl $4278190080,%esi - andl $4278190080,%edi + andl $0xff000000,%esi + andl $0xff000000,%edi xorl %esi,%r12d xorl %edi,%r8d @@ -241,8 +241,8 @@ L$enc_loop_compact: xorl %r8d,%edx cmpq 16(%rsp),%r15 je L$enc_compact_done - movl $2155905152,%r10d - movl $2155905152,%r11d + movl $0x80808080,%r10d + movl $0x80808080,%r11d andl %eax,%r10d andl %ebx,%r11d movl %r10d,%esi @@ -253,10 +253,10 @@ L$enc_loop_compact: leal (%rbx,%rbx,1),%r9d subl %r10d,%esi subl %r11d,%edi - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %eax,%r10d movl %ebx,%r11d xorl %esi,%r8d @@ -264,9 +264,9 @@ L$enc_loop_compact: xorl %r8d,%eax xorl %r9d,%ebx - movl $2155905152,%r12d + movl $0x80808080,%r12d roll $24,%eax - movl $2155905152,%ebp + movl $0x80808080,%ebp roll $24,%ebx andl %ecx,%r12d andl %edx,%ebp @@ -289,10 +289,10 @@ L$enc_loop_compact: xorl %r10d,%eax xorl %r11d,%ebx - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %ecx,%r12d movl %edx,%ebp xorl %esi,%r8d @@ -345,7 +345,7 @@ _AES_encrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -370,7 +370,7 @@ L$enc_prologue: leaq L$AES_Te+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 call _x86_64_AES_encrypt_compact @@ -792,7 +792,7 @@ _AES_decrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -817,7 +817,7 @@ L$dec_prologue: leaq L$AES_Td+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 shrq $3,%rbp addq %rbp,%r14 @@ -1333,9 +1333,9 @@ L$cbc_picked_te: movq %r14,%r10 leaq 2304(%r14),%r11 movq %r15,%r12 - andq $4095,%r10 - andq $4095,%r11 - andq $4095,%r12 + andq $0xFFF,%r10 + andq $0xFFF,%r11 + andq $0xFFF,%r12 cmpq %r11,%r12 jb L$cbc_te_break_out @@ -1344,7 +1344,7 @@ L$cbc_picked_te: jmp L$cbc_te_ok L$cbc_te_break_out: subq %r10,%r12 - andq $4095,%r12 + andq $0xFFF,%r12 addq $320,%r12 subq %r12,%r15 .p2align 2 @@ -1370,7 +1370,7 @@ L$cbc_fast_body: movq %r15,%r10 subq %r14,%r10 - andq $4095,%r10 + andq $0xfff,%r10 cmpq $2304,%r10 jb L$cbc_do_ecopy cmpq $4096-248,%r10 @@ -1557,7 +1557,7 @@ L$cbc_slow_prologue: leaq -88-63(%rcx),%r10 subq %rbp,%r10 negq %r10 - andq $960,%r10 + andq $0x3c0,%r10 subq %r10,%rbp xchgq %rsp,%rbp @@ -1586,7 +1586,7 @@ L$cbc_slow_body: leaq 2048(%r14),%r14 leaq 768-8(%rsp),%rax subq %r14,%rax - andq $768,%rax + andq $0x300,%rax leaq (%r14,%rax,1),%r14 cmpq $0,%rbx diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s index 015db5faa7facd..970a12149bd241 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-sha1-x86_64.s @@ -1392,8 +1392,8 @@ aesni_cbc_sha1_enc_shaext: movups 16(%rcx),%xmm0 leaq 112(%rcx),%rcx - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 jmp L$oop_shaext 
.p2align 4 @@ -1672,8 +1672,8 @@ L$aesenclast9: leaq 64(%rdi),%rdi jnz L$oop_shaext - pshufd $27,%xmm8,%xmm8 - pshufd $27,%xmm9,%xmm9 + pshufd $0b00011011,%xmm8,%xmm8 + pshufd $0b00011011,%xmm9,%xmm9 movups %xmm2,(%r8) movdqu %xmm8,(%r9) movd %xmm9,16(%r9) diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s index 41ad80eebd1f89..6aa1441150a825 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/aesni-x86_64.s @@ -503,7 +503,7 @@ _aesni_ecb_encrypt: testl %r8d,%r8d jz L$ecb_decrypt - cmpq $128,%rdx + cmpq $0x80,%rdx jb L$ecb_enc_tail movdqu (%rdi),%xmm2 @@ -515,7 +515,7 @@ _aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp L$ecb_enc_loop8_enter .p2align 4 L$ecb_enc_loop8: @@ -543,7 +543,7 @@ L$ecb_enc_loop8_enter: call _aesni_encrypt8 - subq $128,%rdx + subq $0x80,%rdx jnc L$ecb_enc_loop8 movups %xmm2,(%rsi) @@ -557,22 +557,22 @@ L$ecb_enc_loop8_enter: movups %xmm8,96(%rsi) movups %xmm9,112(%rsi) leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz L$ecb_ret L$ecb_enc_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$ecb_enc_one movups 16(%rdi),%xmm3 je L$ecb_enc_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$ecb_enc_three movups 48(%rdi),%xmm5 je L$ecb_enc_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb L$ecb_enc_five movups 80(%rdi),%xmm7 je L$ecb_enc_six @@ -646,7 +646,7 @@ L$ecb_enc_six: .p2align 4 L$ecb_decrypt: - cmpq $128,%rdx + cmpq $0x80,%rdx jb L$ecb_dec_tail movdqu (%rdi),%xmm2 @@ -658,7 +658,7 @@ L$ecb_decrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp L$ecb_dec_loop8_enter .p2align 4 L$ecb_dec_loop8: @@ -687,7 +687,7 @@ L$ecb_dec_loop8_enter: call _aesni_decrypt8 movups (%r11),%xmm0 - subq $128,%rdx + subq $0x80,%rdx jnc L$ecb_dec_loop8 movups %xmm2,(%rsi) @@ -709,22 +709,22 @@ L$ecb_dec_loop8_enter: movups %xmm9,112(%rsi) pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz L$ecb_ret L$ecb_dec_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$ecb_dec_one movups 16(%rdi),%xmm3 je L$ecb_dec_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$ecb_dec_three movups 48(%rdi),%xmm5 je L$ecb_dec_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb L$ecb_dec_five movups 80(%rdi),%xmm7 je L$ecb_dec_six @@ -1598,7 +1598,7 @@ L$oop_enc1_8: movdqa L$xts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -1697,7 +1697,7 @@ L$xts_enc_grandloop: .byte 102,15,56,220,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_enc_loop6 .p2align 5 L$xts_enc_loop6: @@ -1836,13 +1836,13 @@ L$xts_enc_short: jz L$xts_enc_done pxor %xmm0,%xmm11 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$xts_enc_one pxor %xmm0,%xmm12 je L$xts_enc_two pxor %xmm0,%xmm13 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$xts_enc_three pxor %xmm0,%xmm14 je L$xts_enc_four @@ -2069,7 +2069,7 @@ L$oop_enc1_11: movdqa L$xts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -2168,7 +2168,7 @@ L$xts_dec_grandloop: .byte 102,15,56,222,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd 
$95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_dec_loop6 .p2align 5 L$xts_dec_loop6: @@ -2308,13 +2308,13 @@ L$xts_dec_short: jz L$xts_dec_done pxor %xmm0,%xmm12 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$xts_dec_one pxor %xmm0,%xmm13 je L$xts_dec_two pxor %xmm0,%xmm14 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$xts_dec_three je L$xts_dec_four @@ -2345,7 +2345,7 @@ L$xts_dec_short: pcmpgtd %xmm15,%xmm14 movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - pshufd $19,%xmm14,%xmm11 + pshufd $0x13,%xmm14,%xmm11 andq $15,%r9 jz L$xts_dec_ret @@ -2634,7 +2634,7 @@ L$cbc_decrypt_bulk: leaq -8(%rax),%rbp movups (%r8),%xmm10 movl %r10d,%eax - cmpq $80,%rdx + cmpq $0x50,%rdx jbe L$cbc_dec_tail movups (%rcx),%xmm0 @@ -2650,14 +2650,14 @@ L$cbc_decrypt_bulk: movdqu 80(%rdi),%xmm7 movdqa %xmm6,%xmm15 movl _OPENSSL_ia32cap_P+4(%rip),%r9d - cmpq $112,%rdx + cmpq $0x70,%rdx jbe L$cbc_dec_six_or_seven andl $71303168,%r9d - subq $80,%rdx + subq $0x50,%rdx cmpl $4194304,%r9d je L$cbc_dec_loop6_enter - subq $32,%rdx + subq $0x20,%rdx leaq 112(%rcx),%rcx jmp L$cbc_dec_loop8_enter .p2align 4 @@ -2672,7 +2672,7 @@ L$cbc_dec_loop8_enter: movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 xorq %r11,%r11 - cmpq $112,%rdx + cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 pxor %xmm0,%xmm7 @@ -2857,21 +2857,21 @@ L$cbc_dec_done: movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi - subq $128,%rdx + subq $0x80,%rdx ja L$cbc_dec_loop8 movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx - addq $112,%rdx + addq $0x70,%rdx jle L$cbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi - cmpq $80,%rdx + cmpq $0x50,%rdx jbe L$cbc_dec_tail movaps %xmm11,%xmm2 L$cbc_dec_six_or_seven: - cmpq $96,%rdx + cmpq $0x60,%rdx ja L$cbc_dec_seven movaps %xmm7,%xmm8 @@ -2964,33 +2964,33 @@ L$cbc_dec_loop6_enter: movl %r10d,%eax movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - subq $96,%rdx + subq $0x60,%rdx ja L$cbc_dec_loop6 movdqa %xmm7,%xmm2 - addq $80,%rdx + addq $0x50,%rdx jle L$cbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi L$cbc_dec_tail: movups (%rdi),%xmm2 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_one movups 16(%rdi),%xmm3 movaps %xmm2,%xmm11 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_two movups 32(%rdi),%xmm4 movaps %xmm3,%xmm12 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_three movups 48(%rdi),%xmm5 movaps %xmm4,%xmm13 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_four movups 64(%rdi),%xmm6 @@ -3015,7 +3015,7 @@ L$cbc_dec_tail: movdqa %xmm6,%xmm2 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 - subq $16,%rdx + subq $0x10,%rdx jmp L$cbc_dec_tail_collected .p2align 4 @@ -3332,7 +3332,7 @@ L$oop_key192: pslldq $4,%xmm0 pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0xff,%xmm0,%xmm3 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 @@ -3419,7 +3419,7 @@ L$oop_key256: decl %r10d jz L$done_key256 - pshufd $255,%xmm0,%xmm2 + pshufd $0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 @@ -3462,11 +3462,11 @@ L$key_expansion_128: movups %xmm0,(%rax) leaq 16(%rax),%rax L$key_expansion_128_cold: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 + shufps $0b11111111,%xmm1,%xmm1 xorps %xmm1,%xmm0 .byte 0xf3,0xc3 @@ -3477,25 +3477,25 @@ L$key_expansion_192a: L$key_expansion_192a_cold: movaps %xmm2,%xmm5 L$key_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 movdqa %xmm2,%xmm3 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 pslldq $4,%xmm3 xorps 
%xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 + pshufd $0b01010101,%xmm1,%xmm1 pxor %xmm3,%xmm2 pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0b11111111,%xmm0,%xmm3 pxor %xmm3,%xmm2 .byte 0xf3,0xc3 .p2align 4 L$key_expansion_192b: movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 + shufps $0b01000100,%xmm0,%xmm5 movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 + shufps $0b01001110,%xmm2,%xmm3 movups %xmm3,16(%rax) leaq 32(%rax),%rax jmp L$key_expansion_192b_warm @@ -3505,11 +3505,11 @@ L$key_expansion_256a: movups %xmm2,(%rax) leaq 16(%rax),%rax L$key_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 + shufps $0b00010000,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 + shufps $0b10001100,%xmm0,%xmm4 xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 + shufps $0b11111111,%xmm1,%xmm1 xorps %xmm1,%xmm0 .byte 0xf3,0xc3 @@ -3518,11 +3518,11 @@ L$key_expansion_256b: movups %xmm0,(%rax) leaq 16(%rax),%rax - shufps $16,%xmm2,%xmm4 + shufps $0b00010000,%xmm2,%xmm4 xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 + shufps $0b10001100,%xmm2,%xmm4 xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 + shufps $0b10101010,%xmm1,%xmm1 xorps %xmm1,%xmm2 .byte 0xf3,0xc3 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s index 2af36a90b05f91..52ae782e9a2e48 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/bsaes-x86_64.s @@ -324,45 +324,45 @@ L$enc_sbox: pxor %xmm2,%xmm5 decl %r10d jl L$enc_done - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm3,%xmm9 + pshufd $0x93,%xmm3,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm5,%xmm10 + pshufd $0x93,%xmm5,%xmm10 pxor %xmm9,%xmm3 - pshufd $147,%xmm2,%xmm11 + pshufd $0x93,%xmm2,%xmm11 pxor %xmm10,%xmm5 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm2 - pshufd $147,%xmm1,%xmm13 + pshufd $0x93,%xmm1,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm1 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm2,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm5,%xmm11 - pshufd $78,%xmm2,%xmm7 + pshufd $0x4E,%xmm2,%xmm7 pxor %xmm1,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm3,%xmm10 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm1,%xmm5 + pshufd $0x4E,%xmm1,%xmm5 pxor %xmm11,%xmm7 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm12,%xmm8 pxor %xmm10,%xmm2 pxor %xmm14,%xmm6 @@ -796,24 +796,24 @@ L$dec_sbox: decl %r10d jl L$dec_done - pshufd $78,%xmm15,%xmm7 - pshufd $78,%xmm2,%xmm13 + pshufd $0x4E,%xmm15,%xmm7 + pshufd $0x4E,%xmm2,%xmm13 pxor %xmm15,%xmm7 - pshufd $78,%xmm4,%xmm14 + pshufd $0x4E,%xmm4,%xmm14 pxor %xmm2,%xmm13 - pshufd $78,%xmm0,%xmm8 + pshufd $0x4E,%xmm0,%xmm8 pxor %xmm4,%xmm14 - pshufd $78,%xmm5,%xmm9 + pshufd $0x4E,%xmm5,%xmm9 pxor %xmm0,%xmm8 - pshufd $78,%xmm3,%xmm10 + pshufd $0x4E,%xmm3,%xmm10 pxor %xmm5,%xmm9 pxor %xmm13,%xmm15 pxor %xmm13,%xmm0 - pshufd $78,%xmm1,%xmm11 + pshufd $0x4E,%xmm1,%xmm11 pxor %xmm3,%xmm10 pxor %xmm7,%xmm5 pxor %xmm8,%xmm3 - pshufd $78,%xmm6,%xmm12 + pshufd $0x4E,%xmm6,%xmm12 pxor %xmm1,%xmm11 pxor %xmm14,%xmm0 pxor %xmm9,%xmm1 @@ 
-827,45 +827,45 @@ L$dec_sbox: pxor %xmm14,%xmm1 pxor %xmm14,%xmm6 pxor %xmm12,%xmm4 - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm5,%xmm9 + pshufd $0x93,%xmm5,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm3,%xmm10 + pshufd $0x93,%xmm3,%xmm10 pxor %xmm9,%xmm5 - pshufd $147,%xmm1,%xmm11 + pshufd $0x93,%xmm1,%xmm11 pxor %xmm10,%xmm3 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm1 - pshufd $147,%xmm2,%xmm13 + pshufd $0x93,%xmm2,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm2 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm1,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm3,%xmm11 - pshufd $78,%xmm1,%xmm7 + pshufd $0x4E,%xmm1,%xmm7 pxor %xmm2,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm5,%xmm10 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm2,%xmm3 + pshufd $0x4E,%xmm2,%xmm3 pxor %xmm11,%xmm7 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm12,%xmm8 pxor %xmm1,%xmm10 pxor %xmm14,%xmm6 @@ -1552,20 +1552,20 @@ L$xts_enc_prologue: movdqa %xmm7,(%rax) andq $-16,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc L$xts_enc_short jmp L$xts_enc_loop .p2align 4 L$xts_enc_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1573,7 +1573,7 @@ L$xts_enc_loop: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1582,7 +1582,7 @@ L$xts_enc_loop: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1592,7 +1592,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1602,7 +1602,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1612,7 +1612,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1622,7 +1622,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -1666,20 +1666,20 @@ L$xts_enc_loop: pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc L$xts_enc_loop L$xts_enc_short: - addq $128,%r14 + addq $0x80,%r14 jz L$xts_enc_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa 
%xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1687,7 +1687,7 @@ L$xts_enc_short: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1698,7 +1698,7 @@ L$xts_enc_short: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je L$xts_enc_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1710,7 +1710,7 @@ L$xts_enc_short: cmpq $32,%r14 je L$xts_enc_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1722,7 +1722,7 @@ L$xts_enc_short: cmpq $48,%r14 je L$xts_enc_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1734,7 +1734,7 @@ L$xts_enc_short: cmpq $64,%r14 je L$xts_enc_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1746,7 +1746,7 @@ L$xts_enc_short: cmpq $80,%r14 je L$xts_enc_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2011,20 +2011,20 @@ L$xts_dec_prologue: shlq $4,%rax subq %rax,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc L$xts_dec_short jmp L$xts_dec_loop .p2align 4 L$xts_dec_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2032,7 +2032,7 @@ L$xts_dec_loop: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2041,7 +2041,7 @@ L$xts_dec_loop: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2051,7 +2051,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2061,7 +2061,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2071,7 +2071,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2081,7 +2081,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2125,20 +2125,20 @@ L$xts_dec_loop: pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc L$xts_dec_loop L$xts_dec_short: - addq $128,%r14 + addq $0x80,%r14 jz L$xts_dec_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2146,7 +2146,7 @@ L$xts_dec_short: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - 
pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2157,7 +2157,7 @@ L$xts_dec_short: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je L$xts_dec_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2169,7 +2169,7 @@ L$xts_dec_short: cmpq $32,%r14 je L$xts_dec_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2181,7 +2181,7 @@ L$xts_dec_short: cmpq $48,%r14 je L$xts_dec_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2193,7 +2193,7 @@ L$xts_dec_short: cmpq $64,%r14 je L$xts_dec_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2205,7 +2205,7 @@ L$xts_dec_short: cmpq $80,%r14 je L$xts_dec_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2382,7 +2382,7 @@ L$xts_dec_done: pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 movdqa %xmm6,%xmm5 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s index c724170ce99e1b..2ffd0bc1007578 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/aes/vpaes-x86_64.s @@ -60,7 +60,7 @@ L$enc_loop: addq $16,%r11 pxor %xmm0,%xmm3 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 @@ -120,10 +120,10 @@ _vpaes_decrypt_core: pand %xmm9,%xmm0 .byte 102,15,56,0,208 movdqa L$k_dipt+16(%rip),%xmm0 - xorq $48,%r11 + xorq $0x30,%r11 leaq L$k_dsbd(%rip),%r10 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 pxor %xmm5,%xmm2 movdqa L$k_mc_forward+48(%rip),%xmm5 pxor %xmm2,%xmm0 @@ -242,7 +242,7 @@ L$schedule_am_decrypting: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 movdqu %xmm3,(%rdx) - xorq $48,%r8 + xorq $0x30,%r8 L$schedule_go: cmpl $192,%esi @@ -332,7 +332,7 @@ L$oop_schedule_256: call _vpaes_schedule_mangle - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 movdqa %xmm7,%xmm5 movdqa %xmm6,%xmm7 call _vpaes_schedule_low_round @@ -399,8 +399,8 @@ L$schedule_mangle_last_dec: .p2align 4 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm1 - pshufd $254,%xmm7,%xmm0 + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 @@ -437,7 +437,7 @@ _vpaes_schedule_round: pxor %xmm1,%xmm7 - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 .byte 102,15,58,15,192,1 @@ -596,7 +596,7 @@ L$schedule_mangle_both: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 addq $-16,%r8 - andq $48,%r8 + andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 @@ -614,7 +614,7 @@ _vpaes_set_encrypt_key: movl %eax,240(%rdx) movl $0,%ecx - movl $48,%r8d + movl $0x30,%r8d call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s index 4e70deabbd3f6a..b92f098e73c214 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/rsaz-x86_64.s @@ -461,48 +461,94 @@ _rsaz_512_mul_gather4: pushq %r14 
pushq %r15 - movl %r9d,%r9d - subq $128+24,%rsp + subq $152,%rsp L$mul_gather4_body: - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - movl (%rdx,%r9,4),%ebx -.byte 102,72,15,110,201 + movd %r9d,%xmm8 + movdqa L$inc+16(%rip),%xmm1 + movdqa L$inc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 + + movdqa 0(%rdx),%xmm8 + movdqa 16(%rdx),%xmm9 + movdqa 32(%rdx),%xmm10 + movdqa 48(%rdx),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rdx),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rdx),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rdx),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rdx),%xmm15 + leaq 128(%rdx),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) - shlq $32,%rax - orq %rax,%rbx movq (%rsi),%rax movq 8(%rsi),%rcx - leaq 128(%rdx,%r9,4),%rbp mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 @@ -515,14 +561,12 @@ L$mul_gather4_body: adcq $0,%r13 mulq %rbx - leaq 128(%rbp),%rbp addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 @@ -534,6 +578,35 @@ L$mul_gather4_body: .p2align 5 L$oop_mul_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax @@ -542,7 +615,6 @@ L$oop_mul_gather: adcq $0,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx @@ -551,7 +623,6 @@ L$oop_mul_gather: adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx @@ -560,7 +631,6 @@ L$oop_mul_gather: adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -569,7 +639,6 @@ L$oop_mul_gather: adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -594,7 +663,6 @@ L$oop_mul_gather: adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx @@ -602,7 +670,6 @@ L$oop_mul_gather: movq %rdx,%r15 adcq $0,%r15 - leaq 128(%rbp),%rbp leaq 8(%rdi),%rdi 
decl %ecx @@ -617,8 +684,8 @@ L$oop_mul_gather: movq %r14,48(%rdi) movq %r15,56(%rdi) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -667,7 +734,7 @@ _rsaz_512_mul_scatter4: movl %r9d,%r9d subq $128+24,%rsp L$mul_scatter4_body: - leaq (%r8,%r9,4),%r8 + leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 @@ -703,30 +770,14 @@ L$mul_scatter4_body: call __rsaz_512_subtract - movl %r8d,0(%rsi) - shrq $32,%r8 - movl %r9d,128(%rsi) - shrq $32,%r9 - movl %r10d,256(%rsi) - shrq $32,%r10 - movl %r11d,384(%rsi) - shrq $32,%r11 - movl %r12d,512(%rsi) - shrq $32,%r12 - movl %r13d,640(%rsi) - shrq $32,%r13 - movl %r14d,768(%rsi) - shrq $32,%r14 - movl %r15d,896(%rsi) - shrq $32,%r15 - movl %r8d,64(%rsi) - movl %r9d,192(%rsi) - movl %r10d,320(%rsi) - movl %r11d,448(%rsi) - movl %r12d,576(%rsi) - movl %r13d,704(%rsi) - movl %r14d,832(%rsi) - movl %r15d,960(%rsi) + movq %r8,0(%rsi) + movq %r9,128(%rsi) + movq %r10,256(%rsi) + movq %r11,384(%rsi) + movq %r12,512(%rsi) + movq %r13,640(%rsi) + movq %r14,768(%rsi) + movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -1079,16 +1130,14 @@ L$oop_mul: .p2align 4 _rsaz_512_scatter4: - leaq (%rdi,%rdx,4),%rdi + leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp L$oop_scatter .p2align 4 L$oop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi - movl %eax,(%rdi) - shrq $32,%rax - movl %eax,64(%rdi) + movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz L$oop_scatter @@ -1099,18 +1148,72 @@ L$oop_scatter: .p2align 4 _rsaz_512_gather4: - leaq (%rsi,%rdx,4),%rsi + movd %edx,%xmm8 + movdqa L$inc+16(%rip),%xmm1 + movdqa L$inc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp L$oop_gather .p2align 4 L$oop_gather: - movl (%rsi),%eax - movl 64(%rsi),%r8d + movdqa 0(%rsi),%xmm8 + movdqa 16(%rsi),%xmm9 + movdqa 32(%rsi),%xmm10 + movdqa 48(%rsi),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rsi),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rsi),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rsi),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi - shlq $32,%r8 - orq %r8,%rax - movq %rax,(%rdi) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz L$oop_gather .byte 0xf3,0xc3 +L$SEH_end_rsaz_512_gather4: + + +.p2align 6 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s index 040c324c49a753..c0f0b4bd6878b8 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-gf2m.s @@ -242,7 +242,7 @@ L$body_mul_2x2: movq %rcx,56(%rsp) movq %r8,64(%rsp) - movq $15,%r8 + movq $0xf,%r8 movq %rsi,%rax movq %rcx,%rbp call _mul_1x1 diff --git 
a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s index 2ed1c0ff4233de..9b49555a4d2132 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont.s @@ -633,20 +633,20 @@ L$sqr8x_enter: - leaq -64(%rsp,%r9,4),%r11 + leaq -64(%rsp,%r9,2),%r11 movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$sqr8x_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,4),%rsp + leaq -64(%rsp,%r9,2),%rsp jmp L$sqr8x_sp_done .p2align 5 L$sqr8x_sp_alt: - leaq 4096-64(,%r9,4),%r10 - leaq -64(%rsp,%r9,4),%rsp + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -656,58 +656,80 @@ L$sqr8x_sp_done: movq %r9,%r10 negq %r9 - leaq 64(%rsp,%r9,2),%r11 movq %r8,32(%rsp) movq %rax,40(%rsp) L$sqr8x_body: - movq %r9,%rbp -.byte 102,73,15,110,211 - shrq $3+2,%rbp - movl _OPENSSL_ia32cap_P+8(%rip),%eax - jmp L$sqr8x_copy_n - -.p2align 5 -L$sqr8x_copy_n: - movq 0(%rcx),%xmm0 - movq 8(%rcx),%xmm1 - movq 16(%rcx),%xmm3 - movq 24(%rcx),%xmm4 - leaq 32(%rcx),%rcx - movdqa %xmm0,0(%r11) - movdqa %xmm1,16(%r11) - movdqa %xmm3,32(%r11) - movdqa %xmm4,48(%r11) - leaq 64(%r11),%r11 - decq %rbp - jnz L$sqr8x_copy_n - +.byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 call _bn_sqr8x_internal + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz L$sqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 + pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi - jmp L$sqr8x_zero + jmp L$sqr8x_cond_copy .p2align 5 -L$sqr8x_zero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - movdqa %xmm0,32(%rax) - movdqa %xmm0,48(%rax) - leaq 64(%rax),%rax - movdqa %xmm0,0(%rdx) - movdqa %xmm0,16(%rdx) - movdqa %xmm0,32(%rdx) - movdqa %xmm0,48(%rdx) - leaq 64(%rdx),%rdx - decq %r9 - jnz L$sqr8x_zero +L$sqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz L$sqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s index bcd5140eba332f..c9731e162da51e 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/bn/x86_64-mont5.s @@ -14,46 +14,151 @@ _bn_mul_mont_gather5: L$mul_enter: movl %r9d,%r9d movq %rsp,%rax - movl 8(%rsp),%r10d + movd 8(%rsp),%xmm5 + leaq L$inc(%rip),%r10 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 + leaq 2(%r9),%r11 negq %r11 - leaq 
(%rsp,%r11,8),%rsp + leaq -264(%rsp,%r11,8),%rsp andq $-1024,%rsp movq %rax,8(%rsp,%r9,8) L$mul_body: - movq %rdx,%r12 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq L$magic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 leaq 256(%r12),%r12 - por %xmm3,%xmm0 - .byte 102,72,15,126,195 movq (%r8),%r8 @@ -62,29 +167,14 @@ L$mul_body: xorq %r14,%r14 xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por 
%xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -117,14 +207,12 @@ L$1st_enter: cmpq %r9,%r15 jne L$1st -.byte 102,72,15,126,195 addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 @@ -138,33 +226,78 @@ L$1st_enter: jmp L$outer .p2align 4 L$outer: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax +.byte 102,72,15,126,195 + xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -200,15 +333,12 @@ L$inner_enter: cmpq %r9,%r15 jne L$inner -.byte 102,72,15,126,195 - addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 - movq (%rsp,%r15,8),%r10 + movq (%rsp,%r9,8),%r10 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx @@ -255,6 +385,7 @@ L$copy: movq 8(%rsp,%r9,8),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -277,10 +408,10 @@ L$mul4x_enter: pushq %r13 pushq %r14 pushq %r15 + .byte 0x67 - movl %r9d,%r10d shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 @@ -290,19 +421,21 @@ L$mul4x_enter: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$mul4xsp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$mul4xsp_done .p2align 5 L$mul4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -318,6 +451,7 @@ L$mul4x_body: movq 40(%rsp),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -333,47 +467,141 @@ L$mul4x_epilogue: .p2align 5 mul4x_internal: shlq $5,%r9 - movl 8(%rax),%r10d - leaq 256(%rdx,%r9,1),%r13 + movd 8(%rax),%xmm5 + leaq L$inc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq L$magic_masks(%rip),%rax - andq $3,%r10 
- leaq 96(%rdx,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 - - movq -96(%r12),%xmm0 - leaq 256(%r12),%r14 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 -.byte 0x67 - por %xmm1,%xmm0 - movq -96(%r14),%xmm1 -.byte 0x67 - pand %xmm7,%xmm3 -.byte 0x67 - por %xmm2,%xmm0 - movq -32(%r14),%xmm2 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq 32(%r14),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 .byte 102,72,15,126,195 - movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) @@ -387,26 +615,10 @@ mul4x_internal: movq %rax,%r10 movq (%rcx),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq %r10,%rbp - - - - - - - - leaq 64+8(%rsp,%r11,8),%r14 + leaq 64+8(%rsp),%r14 movq %rdx,%r11 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - leaq 512(%r12),%r12 - por %xmm1,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax @@ -415,7 +627,7 
@@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -425,7 +637,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -435,7 +647,7 @@ mul4x_internal: L$1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -451,7 +663,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -481,7 +693,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -490,7 +702,7 @@ L$1st4x: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -500,7 +712,7 @@ L$1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -516,7 +728,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -529,8 +741,7 @@ L$1st4x: movq %rdi,-16(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -541,6 +752,63 @@ L$1st4x: .p2align 5 L$outer4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx @@ -548,25 +816,11 @@ L$outer4x: movq (%rcx),%rax adcq $0,%rdx - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - imulq %r10,%rbp -.byte 0x67 movq %rdx,%r11 movq %rdi,(%r14) - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 leaq (%r14,%r9,1),%r14 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 @@ -576,7 +830,7 @@ L$outer4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -588,7 +842,7 @@ L$outer4x: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp L$inner4x @@ -597,7 +851,7 @@ L$outer4x: L$inner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -615,7 +869,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 - 
movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -649,7 +903,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -660,7 +914,7 @@ L$inner4x: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 @@ -670,7 +924,7 @@ L$inner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -689,7 +943,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 movq %rbp,%rax - movq -16(%rcx),%rbp + movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -704,9 +958,8 @@ L$inner4x: movq %r13,-24(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 movq %rdi,-16(%r14) - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -717,16 +970,23 @@ L$inner4x: cmpq 16+8(%rsp),%r12 jb L$outer4x + xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi - xorq $1,%rdi + subq %rdi,%rax leaq (%r14,%r9,1),%rbx - leaq (%rcx,%rdi,8),%rbp + movq (%rcx),%r12 + leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi - jmp L$sqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry .globl _bn_power5 @@ -739,9 +999,9 @@ _bn_power5: pushq %r13 pushq %r14 pushq %r15 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 @@ -751,19 +1011,20 @@ _bn_power5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$pwr_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$pwr_sp_done .p2align 5 L$pwr_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -791,10 +1052,15 @@ L$power5_body: .byte 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 @@ -1338,9 +1604,9 @@ L$sqr4x_shift_n_add: movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xorq %rax,%rax - leaq (%rbp,%r9,2),%rcx + leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi @@ -1373,14 +1639,14 @@ L$8x_reduction_loop: .p2align 5 L$8x_reduce: mulq %rbx - movq 16(%rbp),%rax + movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) @@ -1389,7 +1655,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi @@ -1398,7 +1664,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 @@ -1407,7 +1673,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1415,7 +1681,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1423,7 +1689,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r14 - movq 
112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1441,7 +1707,7 @@ L$8x_reduce: decl %ecx jnz L$8x_reduce - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp @@ -1467,14 +1733,14 @@ L$8x_reduce: L$8x_tail: mulq %rbx addq %rax,%r8 - movq 16(%rbp),%rax + movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi @@ -1483,7 +1749,7 @@ L$8x_tail: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 @@ -1491,7 +1757,7 @@ L$8x_tail: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 @@ -1499,7 +1765,7 @@ L$8x_tail: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1507,7 +1773,7 @@ L$8x_tail: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1515,7 +1781,7 @@ L$8x_tail: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1533,7 +1799,7 @@ L$8x_tail: decl %ecx jnz L$8x_tail - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae L$8x_tail_done @@ -1579,7 +1845,7 @@ L$8x_no_tail: adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax - movq -16(%rbp),%rcx + movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 @@ -1597,40 +1863,58 @@ L$8x_no_tail: cmpq %rdx,%rdi jb L$8x_reduction_loop + .byte 0xf3,0xc3 + - subq %r15,%rcx +.p2align 5 +__bn_post4x_internal: + movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx - adcq %rsi,%rsi movq %r9,%rcx - orq %rsi,%rax .byte 102,72,15,126,207 - xorq $1,%rax + negq %rax .byte 102,72,15,126,206 - leaq (%rbp,%rax,8),%rbp sarq $3+2,%rcx - jmp L$sqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry -.p2align 5 +.p2align 4 L$sqr4x_sub: -.byte 0x66 - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rbx),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 32(%rbp),%r14 + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 movq %r12,0(%rdi) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp + leaq 32(%rbx),%rbx movq %r13,8(%rdi) + sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz L$sqr4x_sub + movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 @@ -1656,10 +1940,9 @@ bn_from_mont8x: pushq %r13 pushq %r14 pushq %r15 -.byte 0x67 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 @@ -1669,19 +1952,20 @@ bn_from_mont8x: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$from_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$from_sp_done .p2align 5 L$from_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -1732,7 +2016,8 @@ L$mul_by_1: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 - call 
sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 leaq 48(%rsp),%rax @@ -1799,45 +2084,169 @@ L$scatter_epilogue: .globl _bn_gather5 -.p2align 4 +.p2align 5 _bn_gather5: - movl %ecx,%r11d - shrl $3,%ecx - andq $7,%r11 - notl %ecx - leaq L$magic_masks(%rip),%rax - andl $3,%ecx - leaq 128(%rdx,%r11,8),%rdx - movq 0(%rax,%rcx,8),%xmm4 - movq 8(%rax,%rcx,8),%xmm5 - movq 16(%rax,%rcx,8),%xmm6 - movq 24(%rax,%rcx,8),%xmm7 +L$SEH_begin_bn_gather5: + +.byte 0x4c,0x8d,0x14,0x24 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq L$inc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) jmp L$gather -.p2align 4 -L$gather: - movq -128(%rdx),%xmm0 - movq -64(%rdx),%xmm1 - pand %xmm4,%xmm0 - movq 0(%rdx),%xmm2 - pand %xmm5,%xmm1 - movq 64(%rdx),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 -.byte 0x67,0x67 - por %xmm2,%xmm0 - leaq 256(%rdx),%rdx - por %xmm3,%xmm0 +.p2align 5 +L$gather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 
112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz L$gather + + leaq (%r10),%rsp .byte 0xf3,0xc3 L$SEH_end_bn_gather5: .p2align 6 -L$magic_masks: -.long 0,0, 0,0, 0,0, -1,-1 -.long 0,0, 0,0, 0,0, 0,0 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s index 0a3145ad4b8969..8025d088fdab4e 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/camellia/cmll-x86_64.s @@ -1624,7 +1624,7 @@ L$cbc_prologue: leaq -64-63(%rcx),%r10 subq %rsp,%r10 negq %r10 - andq $960,%r10 + andq $0x3C0,%r10 subq %r10,%rsp diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s index a63b602b9baa97..30456b900fdffc 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/ec/ecp_nistz256-x86_64.s @@ -1121,6 +1121,7 @@ _ecp_nistz256_point_double: pushq %r15 subq $160+8,%rsp +L$point_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 @@ -1341,7 +1342,7 @@ _ecp_nistz256_point_add: por %xmm1,%xmm3 movdqu 0(%rsi),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 @@ -1351,7 +1352,7 @@ _ecp_nistz256_point_add: movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1371,10 +1372,10 @@ _ecp_nistz256_point_add: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 @@ -1383,6 +1384,7 @@ _ecp_nistz256_point_add: movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi @@ -1474,7 +1476,7 @@ _ecp_nistz256_point_add: testq %r8,%r8 jnz L$add_proceedq testq %r9,%r9 - jz L$add_proceedq + jz L$add_doubleq .byte 102,72,15,126,199 pxor %xmm0,%xmm0 @@ -1486,6 +1488,13 @@ _ecp_nistz256_point_add: movdqu %xmm0,80(%rdi) jmp L$add_doneq +.p2align 5 +L$add_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp L$point_double_shortcutq + .p2align 5 L$add_proceedq: movq 0+64(%rsp),%rax @@ -1733,13 +1742,13 @@ _ecp_nistz256_point_add_affine: por %xmm1,%xmm3 movdqu 0(%rbx),%xmm0 - pshufd $177,%xmm3,%xmm5 + pshufd $0xb1,%xmm3,%xmm5 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) - pshufd $30,%xmm5,%xmm4 + pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 @@ -1755,13 +1764,13 @@ _ecp_nistz256_point_add_affine: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $177,%xmm3,%xmm4 + pshufd $0xb1,%xmm3,%xmm4 movq 
0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 - pshufd $30,%xmm4,%xmm3 + pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s index f21b3013c55f51..77fddf934af3dc 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/modes/ghash-x86_64.s @@ -20,14 +20,14 @@ L$gmult_prologue: movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl movq %r8,%rdx jmp L$oop1 .p2align 4 L$oop1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 @@ -43,13 +43,13 @@ L$oop1: js L$break1 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 @@ -58,19 +58,19 @@ L$oop1: .p2align 4 L$break1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 @@ -661,10 +661,10 @@ L$ghash_epilogue: _gcm_init_clmul: L$_init_clmul: movdqu (%rsi),%xmm2 - pshufd $78,%xmm2,%xmm2 + pshufd $0b01001110,%xmm2,%xmm2 - pshufd $255,%xmm2,%xmm4 + pshufd $0b11111111,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 @@ -678,11 +678,11 @@ L$_init_clmul: pxor %xmm5,%xmm2 - pshufd $78,%xmm2,%xmm6 + pshufd $0b01001110,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -718,8 +718,8 @@ L$_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm2,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 @@ -727,7 +727,7 @@ L$_init_clmul: .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -765,7 +765,7 @@ L$_init_clmul: pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -801,8 +801,8 @@ L$_init_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - pshufd $78,%xmm5,%xmm3 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm5,%xmm3 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 @@ -822,7 +822,7 @@ L$_gmult_clmul: movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 @@ -874,20 +874,20 @@ L$_ghash_clmul: movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 - subq $16,%rcx + subq $0x10,%rcx jz L$odd_tail movdqu 16(%rsi),%xmm6 movl _OPENSSL_ia32cap_P+4(%rip),%eax - cmpq $48,%rcx + cmpq $0x30,%rcx jb L$skip4x andl $71303168,%eax cmpl $4194304,%eax je L$skip4x - subq $48,%rcx - movq $11547335547999543296,%rax + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 @@ -899,14 
+899,14 @@ L$_ghash_clmul: .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 @@ -921,12 +921,12 @@ L$_ghash_clmul: .byte 102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 @@ -934,7 +934,7 @@ L$_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jc L$tail4x jmp L$mod4_loop @@ -949,14 +949,14 @@ L$mod4_loop: movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 @@ -1000,7 +1000,7 @@ L$mod4_loop: movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 - pshufd $78,%xmm11,%xmm12 + pshufd $0b01001110,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 @@ -1010,14 +1010,14 @@ L$mod4_loop: movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 + pshufd $0b01001110,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jnc L$mod4_loop L$tail4x: @@ -1061,10 +1061,10 @@ L$tail4x: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - addq $64,%rcx + addq $0x40,%rcx jz L$done movdqu 32(%rsi),%xmm7 - subq $16,%rcx + subq $0x10,%rcx jz L$odd_tail L$skip4x: @@ -1079,7 +1079,7 @@ L$skip4x: pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 + pshufd $0b01001110,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 @@ -1087,7 +1087,7 @@ L$skip4x: leaq 32(%rdx),%rdx nop - subq $32,%rcx + subq $0x20,%rcx jbe L$even_tail nop jmp L$mod_loop @@ -1096,7 +1096,7 @@ L$skip4x: L$mod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1134,7 +1134,7 @@ L$mod_loop: pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 - pshufd $78,%xmm5,%xmm4 + pshufd $0b01001110,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 @@ -1150,13 +1150,13 @@ L$mod_loop: .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 - subq $32,%rcx + subq $0x20,%rcx ja L$mod_loop L$even_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 + pshufd $0b01001110,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 @@ -1204,7 +1204,7 @@ L$odd_tail: .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 + pshufd $0b01001110,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s index 010924530be391..a0de51655d1d3a 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-mb-x86_64.s @@ -2599,10 +2599,10 @@ 
L$oop_grande_shaext: punpcklqdq %xmm5,%xmm0 punpckhqdq %xmm5,%xmm8 - pshufd $63,%xmm7,%xmm1 - pshufd $127,%xmm7,%xmm9 - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00111111,%xmm7,%xmm1 + pshufd $0b01111111,%xmm7,%xmm9 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 jmp L$oop_shaext .p2align 5 @@ -2857,8 +2857,8 @@ L$oop_shaext: .byte 69,15,58,204,193,3 .byte 69,15,56,200,214 - pshufd $0,%xmm6,%xmm11 - pshufd $85,%xmm6,%xmm12 + pshufd $0x00,%xmm6,%xmm11 + pshufd $0x55,%xmm6,%xmm12 movdqa %xmm6,%xmm7 pcmpgtd %xmm4,%xmm11 pcmpgtd %xmm4,%xmm12 @@ -2888,8 +2888,8 @@ L$oop_shaext: movl 280(%rsp),%edx - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm8,%xmm8 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm8,%xmm8 movdqa %xmm0,%xmm6 punpckldq %xmm8,%xmm0 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s index 671034cdafa3c4..798ca0dc4d076a 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha1-x86_64.s @@ -1240,9 +1240,9 @@ _shaext_shortcut: movdqa K_XX_XX+160(%rip),%xmm3 movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 + pshufd $0b00011011,%xmm0,%xmm0 movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm1,%xmm1 movdqu 32(%rsi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%rsi),%xmm7 @@ -1392,8 +1392,8 @@ L$oop_shaext: jnz L$oop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 + pshufd $0b00011011,%xmm0,%xmm0 + pshufd $0b00011011,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) .byte 0xf3,0xc3 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s index 5ad4c7bb100a2f..276322bec2b7e8 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-mb-x86_64.s @@ -2677,10 +2677,10 @@ L$oop_grande_shaext: punpckhqdq %xmm8,%xmm14 punpckhqdq %xmm10,%xmm15 - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 jmp L$oop_shaext .p2align 5 @@ -2712,11 +2712,11 @@ L$oop_shaext: movdqa %xmm2,%xmm0 movdqa %xmm15,112(%rsp) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pxor %xmm12,%xmm4 movdqa %xmm12,64(%rsp) .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pxor %xmm14,%xmm8 movdqa %xmm14,96(%rsp) movdqa 16-128(%rbp),%xmm1 @@ -2734,11 +2734,11 @@ L$oop_shaext: .byte 102,68,15,56,0,211 prefetcht0 127(%r9) .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,68,15,56,0,219 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 32-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2751,14 +2751,14 @@ L$oop_shaext: movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 .byte 102,15,58,15,222,4 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 48-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2775,13 +2775,13 @@ L$oop_shaext: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd 
$0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 64-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2797,13 +2797,13 @@ L$oop_shaext: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 80-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2819,13 +2819,13 @@ L$oop_shaext: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 96-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2841,13 +2841,13 @@ L$oop_shaext: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 112-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2863,13 +2863,13 @@ L$oop_shaext: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 128-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2885,13 +2885,13 @@ L$oop_shaext: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 144-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2907,13 +2907,13 @@ L$oop_shaext: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 160-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -2929,13 +2929,13 @@ L$oop_shaext: .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 176-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 @@ -2951,13 +2951,13 @@ L$oop_shaext: .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 192-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 @@ -2973,13 +2973,13 @@ L$oop_shaext: .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa 
%xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 208-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 @@ -2995,13 +2995,13 @@ L$oop_shaext: .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 224-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 @@ -3018,13 +3018,13 @@ L$oop_shaext: pxor %xmm6,%xmm6 .byte 69,15,56,203,254 .byte 69,15,56,205,218 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 movdqa 240-128(%rbp),%xmm1 paddd %xmm7,%xmm1 movq (%rbx),%xmm7 nop .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 movdqa 240-128(%rbp),%xmm2 paddd %xmm11,%xmm2 .byte 69,15,56,203,247 @@ -3034,17 +3034,17 @@ L$oop_shaext: cmovgeq %rsp,%r8 cmpl 4(%rbx),%ecx cmovgeq %rsp,%r9 - pshufd $0,%xmm7,%xmm9 + pshufd $0x00,%xmm7,%xmm9 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 - pshufd $85,%xmm7,%xmm10 + pshufd $0x55,%xmm7,%xmm10 movdqa %xmm7,%xmm11 .byte 69,15,56,203,254 - pshufd $14,%xmm1,%xmm0 + pshufd $0x0e,%xmm1,%xmm0 pcmpgtd %xmm6,%xmm9 pcmpgtd %xmm6,%xmm10 .byte 69,15,56,203,229 - pshufd $14,%xmm2,%xmm0 + pshufd $0x0e,%xmm2,%xmm0 pcmpgtd %xmm6,%xmm11 movdqa K256_shaext-16(%rip),%xmm3 .byte 69,15,56,203,247 @@ -3066,10 +3066,10 @@ L$oop_shaext: movl 280(%rsp),%edx - pshufd $27,%xmm12,%xmm12 - pshufd $27,%xmm13,%xmm13 - pshufd $27,%xmm14,%xmm14 - pshufd $27,%xmm15,%xmm15 + pshufd $0b00011011,%xmm12,%xmm12 + pshufd $0b00011011,%xmm13,%xmm13 + pshufd $0b00011011,%xmm14,%xmm14 + pshufd $0b00011011,%xmm15,%xmm15 movdqa %xmm12,%xmm5 movdqa %xmm13,%xmm6 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s index aa507cada6bb4f..5566d587610ea7 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/sha/sha256-x86_64.s @@ -1754,9 +1754,9 @@ _shaext_shortcut: movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 @@ -1775,7 +1775,7 @@ L$oop_shaext: .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 @@ -1784,7 +1784,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 @@ -1793,7 +1793,7 @@ L$oop_shaext: paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1805,7 +1805,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1816,7 +1816,7 @@ L$oop_shaext: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1827,7 +1827,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd 
$0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1838,7 +1838,7 @@ L$oop_shaext: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1849,7 +1849,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1860,7 +1860,7 @@ L$oop_shaext: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1871,7 +1871,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop @@ -1882,7 +1882,7 @@ L$oop_shaext: paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop @@ -1893,7 +1893,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop @@ -1904,7 +1904,7 @@ L$oop_shaext: paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop @@ -1915,7 +1915,7 @@ L$oop_shaext: paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 @@ -1924,7 +1924,7 @@ L$oop_shaext: movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 @@ -1933,7 +1933,7 @@ L$oop_shaext: paddd %xmm6,%xmm0 nop .byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 + pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 @@ -1942,9 +1942,9 @@ L$oop_shaext: paddd %xmm9,%xmm1 jnz L$oop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 diff --git a/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s b/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s index 5d69baad8f4ab7..ef623d5967716c 100644 --- a/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s +++ b/deps/openssl/asm_obsolete/x64-macosx-gas/x86_64cpuid.s @@ -45,43 +45,43 @@ _OPENSSL_ia32_cpuid: movl %eax,%r11d xorl %eax,%eax - cmpl $1970169159,%ebx + cmpl $0x756e6547,%ebx setne %al movl %eax,%r9d - cmpl $1231384169,%edx + cmpl $0x49656e69,%edx setne %al orl %eax,%r9d - cmpl $1818588270,%ecx + cmpl $0x6c65746e,%ecx setne %al orl %eax,%r9d jz L$intel - cmpl $1752462657,%ebx + cmpl $0x68747541,%ebx setne %al movl %eax,%r10d - cmpl $1769238117,%edx + cmpl $0x69746E65,%edx setne %al orl %eax,%r10d - cmpl $1145913699,%ecx + cmpl $0x444D4163,%ecx setne %al orl %eax,%r10d jnz L$intel - movl $2147483648,%eax + movl $0x80000000,%eax cpuid - cmpl $2147483649,%eax + cmpl $0x80000001,%eax jb L$intel movl %eax,%r10d - movl $2147483649,%eax + movl $0x80000001,%eax cpuid orl %ecx,%r9d - andl $2049,%r9d + andl $0x00000801,%r9d - cmpl $2147483656,%r10d + cmpl $0x80000008,%r10d jb L$intel - movl $2147483656,%eax + movl $0x80000008,%eax cpuid movzbq %cl,%r10 incq %r10 @@ -93,7 +93,7 @@ _OPENSSL_ia32_cpuid: shrl $16,%ebx cmpb 
%r10b,%bl ja L$generic - andl $4026531839,%edx + andl $0xefffffff,%edx jmp L$generic L$intel: @@ -106,7 +106,7 @@ L$intel: cpuid movl %eax,%r10d shrl $14,%r10d - andl $4095,%r10d + andl $0xfff,%r10d cmpl $7,%r11d jb L$nocacheinfo @@ -119,29 +119,29 @@ L$intel: L$nocacheinfo: movl $1,%eax cpuid - andl $3220176895,%edx + andl $0xbfefffff,%edx cmpl $0,%r9d jne L$notintel - orl $1073741824,%edx + orl $0x40000000,%edx andb $15,%ah cmpb $15,%ah jne L$notintel - orl $1048576,%edx + orl $0x00100000,%edx L$notintel: btl $28,%edx jnc L$generic - andl $4026531839,%edx + andl $0xefffffff,%edx cmpl $0,%r10d je L$generic - orl $268435456,%edx + orl $0x10000000,%edx shrl $16,%ebx cmpb $1,%bl ja L$generic - andl $4026531839,%edx + andl $0xefffffff,%edx L$generic: - andl $2048,%r9d - andl $4294965247,%ecx + andl $0x00000800,%r9d + andl $0xfffff7ff,%ecx orl %ecx,%r9d movl %edx,%r10d @@ -153,9 +153,9 @@ L$generic: cmpl $6,%eax je L$done L$clear_avx: - movl $4026525695,%eax + movl $0xefffe7ff,%eax andl %eax,%r9d - andl $4294967263,8(%rdi) + andl $0xffffffdf,8(%rdi) L$done: shlq $32,%r9 movl %r10d,%eax diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm index 86e828d3dc651b..89114311a56e7b 100644 --- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm +++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/rsaz-x86_64.asm @@ -502,48 +502,104 @@ $L$SEH_begin_rsaz_512_mul_gather4:: push r14 push r15 - mov r9d,r9d - sub rsp,128+24 + sub rsp,328 + movaps XMMWORD PTR[160+rsp],xmm6 + movaps XMMWORD PTR[176+rsp],xmm7 + movaps XMMWORD PTR[192+rsp],xmm8 + movaps XMMWORD PTR[208+rsp],xmm9 + movaps XMMWORD PTR[224+rsp],xmm10 + movaps XMMWORD PTR[240+rsp],xmm11 + movaps XMMWORD PTR[256+rsp],xmm12 + movaps XMMWORD PTR[272+rsp],xmm13 + movaps XMMWORD PTR[288+rsp],xmm14 + movaps XMMWORD PTR[304+rsp],xmm15 $L$mul_gather4_body:: - mov eax,DWORD PTR[64+r9*4+rdx] -DB 102,72,15,110,199 - mov ebx,DWORD PTR[r9*4+rdx] -DB 102,72,15,110,201 + movd xmm8,r9d + movdqa xmm1,XMMWORD PTR[(($L$inc+16))] + movdqa xmm0,XMMWORD PTR[$L$inc] + + pshufd xmm8,xmm8,0 + movdqa xmm7,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm8 + movdqa xmm3,xmm7 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm8 + movdqa xmm4,xmm7 + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm8 + movdqa xmm5,xmm7 + paddd xmm4,xmm3 + pcmpeqd xmm3,xmm8 + movdqa xmm6,xmm7 + paddd xmm5,xmm4 + pcmpeqd xmm4,xmm8 + paddd xmm6,xmm5 + pcmpeqd xmm5,xmm8 + paddd xmm7,xmm6 + pcmpeqd xmm6,xmm8 + pcmpeqd xmm7,xmm8 + + movdqa xmm8,XMMWORD PTR[rdx] + movdqa xmm9,XMMWORD PTR[16+rdx] + movdqa xmm10,XMMWORD PTR[32+rdx] + movdqa xmm11,XMMWORD PTR[48+rdx] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD PTR[64+rdx] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD PTR[80+rdx] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD PTR[96+rdx] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD PTR[112+rdx] + lea rbp,QWORD PTR[128+rdx] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,04eh + por xmm8,xmm9 +DB 102,76,15,126,195 + mov QWORD PTR[128+rsp],r8 + mov QWORD PTR[((128+8))+rsp],rdi + mov QWORD PTR[((128+16))+rsp],rcx - shl rax,32 - or rbx,rax mov rax,QWORD PTR[rsi] mov rcx,QWORD PTR[8+rsi] - lea rbp,QWORD PTR[128+r9*4+rdx] mul rbx mov QWORD PTR[rsp],rax mov rax,rcx mov r8,rdx mul rbx - movd xmm4,DWORD PTR[rbp] add r8,rax mov rax,QWORD PTR[16+rsi] mov r9,rdx adc r9,0 mul rbx - movd xmm5,DWORD PTR[64+rbp] 
add r9,rax mov rax,QWORD PTR[24+rsi] mov r10,rdx adc r10,0 mul rbx - pslldq xmm5,4 add r10,rax mov rax,QWORD PTR[32+rsi] mov r11,rdx adc r11,0 mul rbx - por xmm4,xmm5 add r11,rax mov rax,QWORD PTR[40+rsi] mov r12,rdx @@ -556,14 +612,12 @@ DB 102,72,15,110,201 adc r13,0 mul rbx - lea rbp,QWORD PTR[128+rbp] add r13,rax mov rax,QWORD PTR[56+rsi] mov r14,rdx adc r14,0 mul rbx -DB 102,72,15,126,227 add r14,rax mov rax,QWORD PTR[rsi] mov r15,rdx @@ -575,6 +629,35 @@ DB 102,72,15,126,227 ALIGN 32 $L$oop_mul_gather:: + movdqa xmm8,XMMWORD PTR[rbp] + movdqa xmm9,XMMWORD PTR[16+rbp] + movdqa xmm10,XMMWORD PTR[32+rbp] + movdqa xmm11,XMMWORD PTR[48+rbp] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD PTR[64+rbp] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD PTR[80+rbp] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD PTR[96+rbp] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD PTR[112+rbp] + lea rbp,QWORD PTR[128+rbp] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,04eh + por xmm8,xmm9 +DB 102,76,15,126,195 + mul rbx add r8,rax mov rax,QWORD PTR[8+rsi] @@ -583,7 +666,6 @@ $L$oop_mul_gather:: adc r8,0 mul rbx - movd xmm4,DWORD PTR[rbp] add r9,rax mov rax,QWORD PTR[16+rsi] adc rdx,0 @@ -592,7 +674,6 @@ $L$oop_mul_gather:: adc r9,0 mul rbx - movd xmm5,DWORD PTR[64+rbp] add r10,rax mov rax,QWORD PTR[24+rsi] adc rdx,0 @@ -601,7 +682,6 @@ $L$oop_mul_gather:: adc r10,0 mul rbx - pslldq xmm5,4 add r11,rax mov rax,QWORD PTR[32+rsi] adc rdx,0 @@ -610,7 +690,6 @@ $L$oop_mul_gather:: adc r11,0 mul rbx - por xmm4,xmm5 add r12,rax mov rax,QWORD PTR[40+rsi] adc rdx,0 @@ -635,7 +714,6 @@ $L$oop_mul_gather:: adc r14,0 mul rbx -DB 102,72,15,126,227 add r15,rax mov rax,QWORD PTR[rsi] adc rdx,0 @@ -643,7 +721,6 @@ DB 102,72,15,126,227 mov r15,rdx adc r15,0 - lea rbp,QWORD PTR[128+rbp] lea rdi,QWORD PTR[8+rdi] dec ecx @@ -658,8 +735,8 @@ DB 102,72,15,126,227 mov QWORD PTR[48+rdi],r14 mov QWORD PTR[56+rdi],r15 -DB 102,72,15,126,199 -DB 102,72,15,126,205 + mov rdi,QWORD PTR[((128+8))+rsp] + mov rbp,QWORD PTR[((128+16))+rsp] mov r8,QWORD PTR[rsp] mov r9,QWORD PTR[8+rsp] @@ -684,6 +761,17 @@ DB 102,72,15,126,205 call __rsaz_512_subtract lea rax,QWORD PTR[((128+24+48))+rsp] + movaps xmm6,XMMWORD PTR[((160-200))+rax] + movaps xmm7,XMMWORD PTR[((176-200))+rax] + movaps xmm8,XMMWORD PTR[((192-200))+rax] + movaps xmm9,XMMWORD PTR[((208-200))+rax] + movaps xmm10,XMMWORD PTR[((224-200))+rax] + movaps xmm11,XMMWORD PTR[((240-200))+rax] + movaps xmm12,XMMWORD PTR[((256-200))+rax] + movaps xmm13,XMMWORD PTR[((272-200))+rax] + movaps xmm14,XMMWORD PTR[((288-200))+rax] + movaps xmm15,XMMWORD PTR[((304-200))+rax] + lea rax,QWORD PTR[176+rax] mov r15,QWORD PTR[((-48))+rax] mov r14,QWORD PTR[((-40))+rax] mov r13,QWORD PTR[((-32))+rax] @@ -723,7 +811,7 @@ $L$SEH_begin_rsaz_512_mul_scatter4:: mov r9d,r9d sub rsp,128+24 $L$mul_scatter4_body:: - lea r8,QWORD PTR[r9*4+r8] + lea r8,QWORD PTR[r9*8+r8] DB 102,72,15,110,199 DB 102,72,15,110,202 DB 102,73,15,110,208 @@ -759,30 +847,14 @@ DB 102,72,15,126,214 call __rsaz_512_subtract - mov DWORD PTR[rsi],r8d - shr r8,32 - mov DWORD PTR[128+rsi],r9d - shr r9,32 - mov DWORD PTR[256+rsi],r10d - shr r10,32 - mov DWORD PTR[384+rsi],r11d - shr r11,32 - mov DWORD PTR[512+rsi],r12d - shr r12,32 - mov DWORD PTR[640+rsi],r13d - shr r13,32 - mov DWORD PTR[768+rsi],r14d - shr r14,32 - mov DWORD PTR[896+rsi],r15d - shr r15,32 - mov DWORD PTR[64+rsi],r8d - mov DWORD PTR[192+rsi],r9d - 
mov DWORD PTR[320+rsi],r10d - mov DWORD PTR[448+rsi],r11d - mov DWORD PTR[576+rsi],r12d - mov DWORD PTR[704+rsi],r13d - mov DWORD PTR[832+rsi],r14d - mov DWORD PTR[960+rsi],r15d + mov QWORD PTR[rsi],r8 + mov QWORD PTR[128+rsi],r9 + mov QWORD PTR[256+rsi],r10 + mov QWORD PTR[384+rsi],r11 + mov QWORD PTR[512+rsi],r12 + mov QWORD PTR[640+rsi],r13 + mov QWORD PTR[768+rsi],r14 + mov QWORD PTR[896+rsi],r15 lea rax,QWORD PTR[((128+24+48))+rsp] mov r15,QWORD PTR[((-48))+rax] @@ -1151,16 +1223,14 @@ PUBLIC rsaz_512_scatter4 ALIGN 16 rsaz_512_scatter4 PROC PUBLIC - lea rcx,QWORD PTR[r8*4+rcx] + lea rcx,QWORD PTR[r8*8+rcx] mov r9d,8 jmp $L$oop_scatter ALIGN 16 $L$oop_scatter:: mov rax,QWORD PTR[rdx] lea rdx,QWORD PTR[8+rdx] - mov DWORD PTR[rcx],eax - shr rax,32 - mov DWORD PTR[64+rcx],eax + mov QWORD PTR[rcx],rax lea rcx,QWORD PTR[128+rcx] dec r9d jnz $L$oop_scatter @@ -1171,22 +1241,98 @@ PUBLIC rsaz_512_gather4 ALIGN 16 rsaz_512_gather4 PROC PUBLIC - lea rdx,QWORD PTR[r8*4+rdx] +$L$SEH_begin_rsaz_512_gather4:: +DB 048h,081h,0ech,0a8h,000h,000h,000h +DB 00fh,029h,034h,024h +DB 00fh,029h,07ch,024h,010h +DB 044h,00fh,029h,044h,024h,020h +DB 044h,00fh,029h,04ch,024h,030h +DB 044h,00fh,029h,054h,024h,040h +DB 044h,00fh,029h,05ch,024h,050h +DB 044h,00fh,029h,064h,024h,060h +DB 044h,00fh,029h,06ch,024h,070h +DB 044h,00fh,029h,0b4h,024h,080h,0,0,0 +DB 044h,00fh,029h,0bch,024h,090h,0,0,0 + movd xmm8,r8d + movdqa xmm1,XMMWORD PTR[(($L$inc+16))] + movdqa xmm0,XMMWORD PTR[$L$inc] + + pshufd xmm8,xmm8,0 + movdqa xmm7,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm8 + movdqa xmm3,xmm7 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm8 + movdqa xmm4,xmm7 + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm8 + movdqa xmm5,xmm7 + paddd xmm4,xmm3 + pcmpeqd xmm3,xmm8 + movdqa xmm6,xmm7 + paddd xmm5,xmm4 + pcmpeqd xmm4,xmm8 + paddd xmm6,xmm5 + pcmpeqd xmm5,xmm8 + paddd xmm7,xmm6 + pcmpeqd xmm6,xmm8 + pcmpeqd xmm7,xmm8 mov r9d,8 jmp $L$oop_gather ALIGN 16 $L$oop_gather:: - mov eax,DWORD PTR[rdx] - mov r8d,DWORD PTR[64+rdx] + movdqa xmm8,XMMWORD PTR[rdx] + movdqa xmm9,XMMWORD PTR[16+rdx] + movdqa xmm10,XMMWORD PTR[32+rdx] + movdqa xmm11,XMMWORD PTR[48+rdx] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD PTR[64+rdx] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD PTR[80+rdx] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD PTR[96+rdx] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD PTR[112+rdx] lea rdx,QWORD PTR[128+rdx] - shl r8,32 - or rax,r8 - mov QWORD PTR[rcx],rax + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,04eh + por xmm8,xmm9 + movq QWORD PTR[rcx],xmm8 lea rcx,QWORD PTR[8+rcx] dec r9d jnz $L$oop_gather + movaps xmm6,XMMWORD PTR[rsp] + movaps xmm7,XMMWORD PTR[16+rsp] + movaps xmm8,XMMWORD PTR[32+rsp] + movaps xmm9,XMMWORD PTR[48+rsp] + movaps xmm10,XMMWORD PTR[64+rsp] + movaps xmm11,XMMWORD PTR[80+rsp] + movaps xmm12,XMMWORD PTR[96+rsp] + movaps xmm13,XMMWORD PTR[112+rsp] + movaps xmm14,XMMWORD PTR[128+rsp] + movaps xmm15,XMMWORD PTR[144+rsp] + add rsp,0a8h DB 0F3h,0C3h ;repret +$L$SEH_end_rsaz_512_gather4:: rsaz_512_gather4 ENDP + +ALIGN 64 +$L$inc:: + DD 0,0,1,1 + DD 2,2,2,2 EXTERN __imp_RtlVirtualUnwind:NEAR ALIGN 16 @@ -1222,6 +1368,18 @@ se_handler PROC PRIVATE lea rax,QWORD PTR[((128+24+48))+rax] + lea rbx,QWORD PTR[$L$mul_gather4_epilogue] + cmp rbx,r10 + jne $L$se_not_in_mul_gather4 + + lea rax,QWORD PTR[176+rax] + + lea rsi,QWORD PTR[((-48-168))+rax] + lea rdi,QWORD PTR[512+r8] + mov 
ecx,20 + DD 0a548f3fch + +$L$se_not_in_mul_gather4:: mov rbx,QWORD PTR[((-8))+rax] mov rbp,QWORD PTR[((-16))+rax] mov r12,QWORD PTR[((-24))+rax] @@ -1298,6 +1456,10 @@ ALIGN 4 DD imagerel $L$SEH_end_rsaz_512_mul_by_one DD imagerel $L$SEH_info_rsaz_512_mul_by_one + DD imagerel $L$SEH_begin_rsaz_512_gather4 + DD imagerel $L$SEH_end_rsaz_512_gather4 + DD imagerel $L$SEH_info_rsaz_512_gather4 + .pdata ENDS .xdata SEGMENT READONLY ALIGN(8) ALIGN 8 @@ -1321,6 +1483,19 @@ $L$SEH_info_rsaz_512_mul_by_one:: DB 9,0,0,0 DD imagerel se_handler DD imagerel $L$mul_by_one_body,imagerel $L$mul_by_one_epilogue +$L$SEH_info_rsaz_512_gather4:: +DB 001h,046h,016h,000h +DB 046h,0f8h,009h,000h +DB 03dh,0e8h,008h,000h +DB 034h,0d8h,007h,000h +DB 02eh,0c8h,006h,000h +DB 028h,0b8h,005h,000h +DB 022h,0a8h,004h,000h +DB 01ch,098h,003h,000h +DB 016h,088h,002h,000h +DB 010h,078h,001h,000h +DB 00bh,068h,000h,000h +DB 007h,001h,015h,000h .xdata ENDS END diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm index afec83bd17596c..e24eb89aeee6f2 100644 --- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm +++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont.asm @@ -676,20 +676,20 @@ $L$sqr8x_enter:: - lea r11,QWORD PTR[((-64))+r9*4+rsp] + lea r11,QWORD PTR[((-64))+r9*2+rsp] mov r8,QWORD PTR[r8] sub r11,rsi and r11,4095 cmp r10,r11 jb $L$sqr8x_sp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*4+rsp] + lea rsp,QWORD PTR[((-64))+r9*2+rsp] jmp $L$sqr8x_sp_done ALIGN 32 $L$sqr8x_sp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*4] - lea rsp,QWORD PTR[((-64))+r9*4+rsp] + lea r10,QWORD PTR[((4096-64))+r9*2] + lea rsp,QWORD PTR[((-64))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -699,58 +699,80 @@ $L$sqr8x_sp_done:: mov r10,r9 neg r9 - lea r11,QWORD PTR[64+r9*2+rsp] mov QWORD PTR[32+rsp],r8 mov QWORD PTR[40+rsp],rax $L$sqr8x_body:: - mov rbp,r9 -DB 102,73,15,110,211 - shr rbp,3+2 - mov eax,DWORD PTR[((OPENSSL_ia32cap_P+8))] - jmp $L$sqr8x_copy_n - -ALIGN 32 -$L$sqr8x_copy_n:: - movq xmm0,QWORD PTR[rcx] - movq xmm1,QWORD PTR[8+rcx] - movq xmm3,QWORD PTR[16+rcx] - movq xmm4,QWORD PTR[24+rcx] - lea rcx,QWORD PTR[32+rcx] - movdqa XMMWORD PTR[r11],xmm0 - movdqa XMMWORD PTR[16+r11],xmm1 - movdqa XMMWORD PTR[32+r11],xmm3 - movdqa XMMWORD PTR[48+r11],xmm4 - lea r11,QWORD PTR[64+r11] - dec rbp - jnz $L$sqr8x_copy_n - +DB 102,72,15,110,209 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,73,15,110,218 call bn_sqr8x_internal + + + + lea rbx,QWORD PTR[r9*1+rdi] + mov rcx,r9 + mov rdx,r9 +DB 102,72,15,126,207 + sar rcx,3+2 + jmp $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_sub:: + mov r12,QWORD PTR[rbx] + mov r13,QWORD PTR[8+rbx] + mov r14,QWORD PTR[16+rbx] + mov r15,QWORD PTR[24+rbx] + lea rbx,QWORD PTR[32+rbx] + sbb r12,QWORD PTR[rbp] + sbb r13,QWORD PTR[8+rbp] + sbb r14,QWORD PTR[16+rbp] + sbb r15,QWORD PTR[24+rbp] + lea rbp,QWORD PTR[32+rbp] + mov QWORD PTR[rdi],r12 + mov QWORD PTR[8+rdi],r13 + mov QWORD PTR[16+rdi],r14 + mov QWORD PTR[24+rdi],r15 + lea rdi,QWORD PTR[32+rdi] + inc rcx + jnz $L$sqr8x_sub + + sbb rax,0 + lea rbx,QWORD PTR[r9*1+rbx] + lea rdi,QWORD PTR[r9*1+rdi] + +DB 102,72,15,110,200 pxor xmm0,xmm0 - lea rax,QWORD PTR[48+rsp] - lea rdx,QWORD PTR[64+r9*2+rsp] - shr r9,3+2 + pshufd xmm1,xmm1,0 mov rsi,QWORD PTR[40+rsp] - jmp $L$sqr8x_zero + jmp $L$sqr8x_cond_copy ALIGN 32 -$L$sqr8x_zero:: - movdqa XMMWORD PTR[rax],xmm0 - movdqa XMMWORD PTR[16+rax],xmm0 - movdqa XMMWORD PTR[32+rax],xmm0 - movdqa XMMWORD PTR[48+rax],xmm0 - lea rax,QWORD 
PTR[64+rax] - movdqa XMMWORD PTR[rdx],xmm0 - movdqa XMMWORD PTR[16+rdx],xmm0 - movdqa XMMWORD PTR[32+rdx],xmm0 - movdqa XMMWORD PTR[48+rdx],xmm0 - lea rdx,QWORD PTR[64+rdx] - dec r9 - jnz $L$sqr8x_zero +$L$sqr8x_cond_copy:: + movdqa xmm2,XMMWORD PTR[rbx] + movdqa xmm3,XMMWORD PTR[16+rbx] + lea rbx,QWORD PTR[32+rbx] + movdqu xmm4,XMMWORD PTR[rdi] + movdqu xmm5,XMMWORD PTR[16+rdi] + lea rdi,QWORD PTR[32+rdi] + movdqa XMMWORD PTR[(-32)+rbx],xmm0 + movdqa XMMWORD PTR[(-16)+rbx],xmm0 + movdqa XMMWORD PTR[(-32)+rdx*1+rbx],xmm0 + movdqa XMMWORD PTR[(-16)+rdx*1+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD PTR[(-32)+rdi],xmm4 + movdqu XMMWORD PTR[(-16)+rdi],xmm5 + add r9,32 + jnz $L$sqr8x_cond_copy mov rax,1 mov r15,QWORD PTR[((-48))+rsi] diff --git a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm index 86acef32eacdb9..503e2d6a038ee6 100644 --- a/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm +++ b/deps/openssl/asm_obsolete/x64-win32-masm/bn/x86_64-mont5.asm @@ -27,49 +27,151 @@ ALIGN 16 $L$mul_enter:: mov r9d,r9d mov rax,rsp - mov r10d,DWORD PTR[56+rsp] + movd xmm5,DWORD PTR[56+rsp] + lea r10,QWORD PTR[$L$inc] push rbx push rbp push r12 push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 + lea r11,QWORD PTR[2+r9] neg r11 - lea rsp,QWORD PTR[r11*8+rsp] + lea rsp,QWORD PTR[((-264))+r11*8+rsp] and rsp,-1024 mov QWORD PTR[8+r9*8+rsp],rax $L$mul_body:: - mov r12,rdx - mov r11,r10 - shr r10,3 - and r11,7 - not r10 - lea rax,QWORD PTR[$L$magic_masks] - and r10,3 - lea r12,QWORD PTR[96+r11*8+r12] - movq xmm4,QWORD PTR[r10*8+rax] - movq xmm5,QWORD PTR[8+r10*8+rax] - movq xmm6,QWORD PTR[16+r10*8+rax] - movq xmm7,QWORD PTR[24+r10*8+rax] - - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 + lea r12,QWORD PTR[128+rdx] + movdqa xmm0,XMMWORD PTR[r10] + movdqa xmm1,XMMWORD PTR[16+r10] + lea r10,QWORD PTR[((24-112))+r9*8+rsp] + and r10,-16 + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 +DB 067h + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd 
xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[304+r10],xmm0 + + paddd xmm3,xmm2 +DB 067h + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[336+r10],xmm2 + pand xmm0,XMMWORD PTR[64+r12] + + pand xmm1,XMMWORD PTR[80+r12] + pand xmm2,XMMWORD PTR[96+r12] + movdqa XMMWORD PTR[352+r10],xmm3 + pand xmm3,XMMWORD PTR[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-128))+r12] + movdqa xmm5,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + pand xmm4,XMMWORD PTR[112+r10] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm5,XMMWORD PTR[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[160+r10] por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-64))+r12] + movdqa xmm5,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + pand xmm4,XMMWORD PTR[176+r10] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm5,XMMWORD PTR[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[r12] + movdqa xmm5,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + pand xmm4,XMMWORD PTR[240+r10] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm5,XMMWORD PTR[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 - DB 102,72,15,126,195 mov r8,QWORD PTR[r8] @@ -78,29 +180,14 @@ DB 102,72,15,126,195 xor r14,r14 xor r15,r15 - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - mov rbp,r8 mul rbx mov r10,rax mov rax,QWORD PTR[rcx] - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - imul rbp,r10 mov r11,rdx - por xmm0,xmm2 - lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 - mul rbp add r10,rax mov rax,QWORD PTR[8+rsi] @@ -133,14 +220,12 @@ $L$1st_enter:: cmp r15,r9 jne $L$1st -DB 102,72,15,126,195 add r13,rax - mov rax,QWORD PTR[rsi] adc rdx,0 add r13,r11 adc rdx,0 - mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov QWORD PTR[((-16))+r9*8+rsp],r13 mov r13,rdx mov r11,r10 @@ -154,33 +239,78 @@ DB 102,72,15,126,195 jmp $L$outer ALIGN 16 $L$outer:: + lea rdx,QWORD PTR[((24+128))+r9*8+rsp] + and rdx,-16 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD PTR[((-128))+r12] + movdqa xmm1,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm0,XMMWORD PTR[((-128))+rdx] + pand xmm1,XMMWORD PTR[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[((-64))+r12] + movdqa xmm1,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm0,XMMWORD PTR[((-64))+rdx] + pand xmm1,XMMWORD PTR[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[r12] + movdqa xmm1,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm0,XMMWORD PTR[rdx] + pand xmm1,XMMWORD PTR[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD 
PTR[64+r12] + movdqa xmm1,XMMWORD PTR[80+r12] + movdqa xmm2,XMMWORD PTR[96+r12] + movdqa xmm3,XMMWORD PTR[112+r12] + pand xmm0,XMMWORD PTR[64+rdx] + pand xmm1,XMMWORD PTR[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,04eh + por xmm0,xmm4 + lea r12,QWORD PTR[256+r12] + + mov rax,QWORD PTR[rsi] +DB 102,72,15,126,195 + xor r15,r15 mov rbp,r8 mov r10,QWORD PTR[rsp] - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - mul rbx add r10,rax mov rax,QWORD PTR[rcx] adc rdx,0 - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - imul rbp,r10 mov r11,rdx - por xmm0,xmm2 - lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 - mul rbp add r10,rax mov rax,QWORD PTR[8+rsi] @@ -216,15 +346,12 @@ $L$inner_enter:: cmp r15,r9 jne $L$inner -DB 102,72,15,126,195 - add r13,rax - mov rax,QWORD PTR[rsi] adc rdx,0 add r13,r10 - mov r10,QWORD PTR[r15*8+rsp] + mov r10,QWORD PTR[r9*8+rsp] adc rdx,0 - mov QWORD PTR[((-16))+r15*8+rsp],r13 + mov QWORD PTR[((-16))+r9*8+rsp],r13 mov r13,rdx xor rdx,rdx @@ -271,8 +398,7 @@ $L$copy:: mov rsi,QWORD PTR[8+r9*8+rsp] mov rax,1 - movaps xmm6,XMMWORD PTR[((-88))+rsi] - movaps xmm7,XMMWORD PTR[((-72))+rsi] + mov r15,QWORD PTR[((-48))+rsi] mov r14,QWORD PTR[((-40))+rsi] mov r13,QWORD PTR[((-32))+rsi] @@ -310,13 +436,10 @@ DB 067h push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 + DB 067h - mov r10d,r9d shl r9d,3 - shl r10d,3+2 + lea r10,QWORD PTR[r9*2+r9] neg r9 @@ -326,19 +449,21 @@ DB 067h - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$mul4xsp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$mul4xsp_done ALIGN 32 $L$mul4xsp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -354,8 +479,7 @@ $L$mul4x_body:: mov rsi,QWORD PTR[40+rsp] mov rax,1 - movaps xmm6,XMMWORD PTR[((-88))+rsi] - movaps xmm7,XMMWORD PTR[((-72))+rsi] + mov r15,QWORD PTR[((-48))+rsi] mov r14,QWORD PTR[((-40))+rsi] mov r13,QWORD PTR[((-32))+rsi] @@ -374,47 +498,141 @@ bn_mul4x_mont_gather5 ENDP ALIGN 32 mul4x_internal PROC PRIVATE shl r9,5 - mov r10d,DWORD PTR[56+rax] - lea r13,QWORD PTR[256+r9*1+rdx] + movd xmm5,DWORD PTR[56+rax] + lea rax,QWORD PTR[$L$inc] + lea r13,QWORD PTR[128+r9*1+rdx] shr r9,5 - mov r11,r10 - shr r10,3 - and r11,7 - not r10 - lea rax,QWORD PTR[$L$magic_masks] - and r10,3 - lea r12,QWORD PTR[96+r11*8+rdx] - movq xmm4,QWORD PTR[r10*8+rax] - movq xmm5,QWORD PTR[8+r10*8+rax] - add r11,7 - movq xmm6,QWORD PTR[16+r10*8+rax] - movq xmm7,QWORD PTR[24+r10*8+rax] - and r11,7 - - movq xmm0,QWORD PTR[((-96))+r12] - lea r14,QWORD PTR[256+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+r12] - pand xmm2,xmm6 -DB 067h - por xmm0,xmm1 - movq xmm1,QWORD PTR[((-96))+r14] -DB 067h - pand xmm3,xmm7 -DB 067h - por xmm0,xmm2 - movq xmm2,QWORD PTR[((-32))+r14] + movdqa xmm0,XMMWORD PTR[rax] + movdqa xmm1,XMMWORD PTR[16+rax] + lea r10,QWORD PTR[((88-112))+r9*1+rsp] + lea r12,QWORD PTR[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 067h,067h + 
movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 DB 067h - pand xmm1,xmm4 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[304+r10],xmm0 + + paddd xmm3,xmm2 DB 067h - por xmm0,xmm3 - movq xmm3,QWORD PTR[32+r14] + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[336+r10],xmm2 + pand xmm0,XMMWORD PTR[64+r12] + pand xmm1,XMMWORD PTR[80+r12] + pand xmm2,XMMWORD PTR[96+r12] + movdqa XMMWORD PTR[352+r10],xmm3 + pand xmm3,XMMWORD PTR[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-128))+r12] + movdqa xmm5,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + pand xmm4,XMMWORD PTR[112+r10] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm5,XMMWORD PTR[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[((-64))+r12] + movdqa xmm5,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + pand xmm4,XMMWORD PTR[176+r10] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm5,XMMWORD PTR[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD PTR[r12] + movdqa xmm5,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + pand xmm4,XMMWORD PTR[240+r10] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm5,XMMWORD PTR[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD PTR[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD PTR[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 + lea r12,QWORD PTR[256+r12] DB 102,72,15,126,195 - movq xmm0,QWORD PTR[96+r14] + mov QWORD PTR[((16+8))+rsp],r13 mov QWORD PTR[((56+8))+rsp],rdi @@ -428,26 +646,10 @@ DB 102,72,15,126,195 mov r10,rax mov rax,QWORD PTR[rcx] - pand xmm2,xmm5 - pand xmm3,xmm6 - por xmm1,xmm2 - imul rbp,r10 - - - - - - - - lea r14,QWORD PTR[((64+8))+r11*8+rsp] + lea r14,QWORD PTR[((64+8))+rsp] mov r11,rdx - pand xmm0,xmm7 - por xmm1,xmm3 - lea r12,QWORD PTR[512+r12] - por xmm0,xmm1 - mul rbp add r10,rax mov rax,QWORD PTR[8+r9*1+rsi] @@ -456,7 +658,7 @@ DB 102,72,15,126,195 mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 mov r10,rdx @@ -466,7 +668,7 @@ DB 102,72,15,126,195 adc rdx,0 add rdi,r11 lea r15,QWORD PTR[32+r9] - lea rcx,QWORD PTR[64+rcx] + lea 
rcx,QWORD PTR[32+rcx] adc rdx,0 mov QWORD PTR[r14],rdi mov r13,rdx @@ -476,7 +678,7 @@ ALIGN 32 $L$1st4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] lea r14,QWORD PTR[32+r14] adc rdx,0 mov r11,rdx @@ -492,7 +694,7 @@ $L$1st4x:: mul rbx add r11,rax - mov rax,QWORD PTR[((-16))+rcx] + mov rax,QWORD PTR[((-8))+rcx] adc rdx,0 mov r10,rdx @@ -522,7 +724,7 @@ $L$1st4x:: mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 mov r10,rdx @@ -531,7 +733,7 @@ $L$1st4x:: mov rax,QWORD PTR[16+r15*1+rsi] adc rdx,0 add rdi,r11 - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] adc rdx,0 mov QWORD PTR[r14],rdi mov r13,rdx @@ -541,7 +743,7 @@ $L$1st4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] lea r14,QWORD PTR[32+r14] adc rdx,0 mov r11,rdx @@ -557,7 +759,7 @@ $L$1st4x:: mul rbx add r11,rax - mov rax,QWORD PTR[((-16))+rcx] + mov rax,QWORD PTR[((-8))+rcx] adc rdx,0 mov r10,rdx @@ -570,8 +772,7 @@ $L$1st4x:: mov QWORD PTR[((-16))+r14],rdi mov r13,rdx -DB 102,72,15,126,195 - lea rcx,QWORD PTR[r9*2+rcx] + lea rcx,QWORD PTR[r9*1+rcx] xor rdi,rdi add r13,r10 @@ -582,6 +783,63 @@ DB 102,72,15,126,195 ALIGN 32 $L$outer4x:: + lea rdx,QWORD PTR[((16+128))+r14] + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD PTR[((-128))+r12] + movdqa xmm1,XMMWORD PTR[((-112))+r12] + movdqa xmm2,XMMWORD PTR[((-96))+r12] + movdqa xmm3,XMMWORD PTR[((-80))+r12] + pand xmm0,XMMWORD PTR[((-128))+rdx] + pand xmm1,XMMWORD PTR[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[((-64))+r12] + movdqa xmm1,XMMWORD PTR[((-48))+r12] + movdqa xmm2,XMMWORD PTR[((-32))+r12] + movdqa xmm3,XMMWORD PTR[((-16))+r12] + pand xmm0,XMMWORD PTR[((-64))+rdx] + pand xmm1,XMMWORD PTR[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[r12] + movdqa xmm1,XMMWORD PTR[16+r12] + movdqa xmm2,XMMWORD PTR[32+r12] + movdqa xmm3,XMMWORD PTR[48+r12] + pand xmm0,XMMWORD PTR[rdx] + pand xmm1,XMMWORD PTR[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[64+r12] + movdqa xmm1,XMMWORD PTR[80+r12] + movdqa xmm2,XMMWORD PTR[96+r12] + movdqa xmm3,XMMWORD PTR[112+r12] + pand xmm0,XMMWORD PTR[64+rdx] + pand xmm1,XMMWORD PTR[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,04eh + por xmm0,xmm4 + lea r12,QWORD PTR[256+r12] +DB 102,72,15,126,195 + mov r10,QWORD PTR[r9*1+r14] mov rbp,r8 mul rbx @@ -589,25 +847,11 @@ $L$outer4x:: mov rax,QWORD PTR[rcx] adc rdx,0 - movq xmm0,QWORD PTR[((-96))+r12] - movq xmm1,QWORD PTR[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[96+r12] - imul rbp,r10 -DB 067h mov r11,rdx mov QWORD PTR[r14],rdi - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - por xmm0,xmm2 lea r14,QWORD PTR[r9*1+r14] - lea r12,QWORD PTR[256+r12] - por xmm0,xmm3 mul rbp add r10,rax @@ -617,7 +861,7 @@ DB 067h mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 add r11,QWORD PTR[8+r14] adc rdx,0 @@ -629,7 +873,7 @@ DB 067h adc rdx,0 add rdi,r11 lea r15,QWORD PTR[32+r9] - lea rcx,QWORD 
PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] adc rdx,0 mov r13,rdx jmp $L$inner4x @@ -638,7 +882,7 @@ ALIGN 32 $L$inner4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] adc rdx,0 add r10,QWORD PTR[16+r14] lea r14,QWORD PTR[32+r14] @@ -656,7 +900,7 @@ $L$inner4x:: mul rbx add r11,rax - mov rax,QWORD PTR[((-16))+rcx] + mov rax,QWORD PTR[((-8))+rcx] adc rdx,0 add r11,QWORD PTR[((-8))+r14] adc rdx,0 @@ -690,7 +934,7 @@ $L$inner4x:: mul rbx add r11,rax - mov rax,QWORD PTR[16+rcx] + mov rax,QWORD PTR[8+rcx] adc rdx,0 add r11,QWORD PTR[8+r14] adc rdx,0 @@ -701,7 +945,7 @@ $L$inner4x:: mov rax,QWORD PTR[16+r15*1+rsi] adc rdx,0 add rdi,r11 - lea rcx,QWORD PTR[64+rcx] + lea rcx,QWORD PTR[32+rcx] adc rdx,0 mov QWORD PTR[((-8))+r14],r13 mov r13,rdx @@ -711,7 +955,7 @@ $L$inner4x:: mul rbx add r10,rax - mov rax,QWORD PTR[((-32))+rcx] + mov rax,QWORD PTR[((-16))+rcx] adc rdx,0 add r10,QWORD PTR[16+r14] lea r14,QWORD PTR[32+r14] @@ -730,7 +974,7 @@ $L$inner4x:: mul rbx add r11,rax mov rax,rbp - mov rbp,QWORD PTR[((-16))+rcx] + mov rbp,QWORD PTR[((-8))+rcx] adc rdx,0 add r11,QWORD PTR[((-8))+r14] adc rdx,0 @@ -745,9 +989,8 @@ $L$inner4x:: mov QWORD PTR[((-24))+r14],r13 mov r13,rdx -DB 102,72,15,126,195 mov QWORD PTR[((-16))+r14],rdi - lea rcx,QWORD PTR[r9*2+rcx] + lea rcx,QWORD PTR[r9*1+rcx] xor rdi,rdi add r13,r10 @@ -758,16 +1001,23 @@ DB 102,72,15,126,195 cmp r12,QWORD PTR[((16+8))+rsp] jb $L$outer4x + xor rax,rax sub rbp,r13 adc r15,r15 or rdi,r15 - xor rdi,1 + sub rax,rdi lea rbx,QWORD PTR[r9*1+r14] - lea rbp,QWORD PTR[rdi*8+rcx] + mov r12,QWORD PTR[rcx] + lea rbp,QWORD PTR[rcx] mov rcx,r9 sar rcx,3+2 mov rdi,QWORD PTR[((56+8))+rsp] - jmp $L$sqr4x_sub + dec r12 + xor r10,r10 + mov r13,QWORD PTR[8+rbp] + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] + jmp $L$sqr4x_sub_entry mul4x_internal ENDP PUBLIC bn_power5 @@ -792,12 +1042,9 @@ $L$SEH_begin_bn_power5:: push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10d,DWORD PTR[r9*2+r9] neg r9 mov r8,QWORD PTR[r8] @@ -807,19 +1054,20 @@ $L$SEH_begin_bn_power5:: - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$pwr_sp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$pwr_sp_done ALIGN 32 $L$pwr_sp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -847,10 +1095,15 @@ DB 102,73,15,110,218 DB 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal DB 102,72,15,126,209 DB 102,72,15,126,226 @@ -1397,9 +1650,9 @@ DB 067h mov QWORD PTR[((-16))+rdi],rbx mov QWORD PTR[((-8))+rdi],r8 DB 102,72,15,126,213 -sqr8x_reduction:: +__bn_sqr8x_reduction:: xor rax,rax - lea rcx,QWORD PTR[r9*2+rbp] + lea rcx,QWORD PTR[rbp*1+r9] lea rdx,QWORD PTR[((48+8))+r9*2+rsp] mov QWORD PTR[((0+8))+rsp],rcx lea rdi,QWORD PTR[((48+8))+r9*1+rsp] @@ -1432,14 +1685,14 @@ DB 067h ALIGN 32 $L$8x_reduce:: mul rbx - mov rax,QWORD PTR[16+rbp] + mov rax,QWORD PTR[8+rbp] neg r8 mov r8,rdx adc r8,0 mul rbx add r9,rax - mov rax,QWORD 
PTR[32+rbp] + mov rax,QWORD PTR[16+rbp] adc rdx,0 add r8,r9 mov QWORD PTR[((48-8+8))+rcx*8+rsp],rbx @@ -1448,7 +1701,7 @@ $L$8x_reduce:: mul rbx add r10,rax - mov rax,QWORD PTR[48+rbp] + mov rax,QWORD PTR[24+rbp] adc rdx,0 add r9,r10 mov rsi,QWORD PTR[((32+8))+rsp] @@ -1457,7 +1710,7 @@ $L$8x_reduce:: mul rbx add r11,rax - mov rax,QWORD PTR[64+rbp] + mov rax,QWORD PTR[32+rbp] adc rdx,0 imul rsi,r8 add r10,r11 @@ -1466,7 +1719,7 @@ $L$8x_reduce:: mul rbx add r12,rax - mov rax,QWORD PTR[80+rbp] + mov rax,QWORD PTR[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx @@ -1474,7 +1727,7 @@ $L$8x_reduce:: mul rbx add r13,rax - mov rax,QWORD PTR[96+rbp] + mov rax,QWORD PTR[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx @@ -1482,7 +1735,7 @@ $L$8x_reduce:: mul rbx add r14,rax - mov rax,QWORD PTR[112+rbp] + mov rax,QWORD PTR[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx @@ -1500,7 +1753,7 @@ $L$8x_reduce:: dec ecx jnz $L$8x_reduce - lea rbp,QWORD PTR[128+rbp] + lea rbp,QWORD PTR[64+rbp] xor rax,rax mov rdx,QWORD PTR[((8+8))+rsp] cmp rbp,QWORD PTR[((0+8))+rsp] @@ -1526,14 +1779,14 @@ ALIGN 32 $L$8x_tail:: mul rbx add r8,rax - mov rax,QWORD PTR[16+rbp] + mov rax,QWORD PTR[8+rbp] mov QWORD PTR[rdi],r8 mov r8,rdx adc r8,0 mul rbx add r9,rax - mov rax,QWORD PTR[32+rbp] + mov rax,QWORD PTR[16+rbp] adc rdx,0 add r8,r9 lea rdi,QWORD PTR[8+rdi] @@ -1542,7 +1795,7 @@ $L$8x_tail:: mul rbx add r10,rax - mov rax,QWORD PTR[48+rbp] + mov rax,QWORD PTR[24+rbp] adc rdx,0 add r9,r10 mov r10,rdx @@ -1550,7 +1803,7 @@ $L$8x_tail:: mul rbx add r11,rax - mov rax,QWORD PTR[64+rbp] + mov rax,QWORD PTR[32+rbp] adc rdx,0 add r10,r11 mov r11,rdx @@ -1558,7 +1811,7 @@ $L$8x_tail:: mul rbx add r12,rax - mov rax,QWORD PTR[80+rbp] + mov rax,QWORD PTR[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx @@ -1566,7 +1819,7 @@ $L$8x_tail:: mul rbx add r13,rax - mov rax,QWORD PTR[96+rbp] + mov rax,QWORD PTR[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx @@ -1574,7 +1827,7 @@ $L$8x_tail:: mul rbx add r14,rax - mov rax,QWORD PTR[112+rbp] + mov rax,QWORD PTR[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx @@ -1592,7 +1845,7 @@ $L$8x_tail:: dec ecx jnz $L$8x_tail - lea rbp,QWORD PTR[128+rbp] + lea rbp,QWORD PTR[64+rbp] mov rdx,QWORD PTR[((8+8))+rsp] cmp rbp,QWORD PTR[((0+8))+rsp] jae $L$8x_tail_done @@ -1638,7 +1891,7 @@ $L$8x_no_tail:: adc r14,QWORD PTR[48+rdi] adc r15,QWORD PTR[56+rdi] adc rax,0 - mov rcx,QWORD PTR[((-16))+rbp] + mov rcx,QWORD PTR[((-8))+rbp] xor rsi,rsi DB 102,72,15,126,213 @@ -1656,44 +1909,62 @@ DB 102,73,15,126,217 cmp rdi,rdx jb $L$8x_reduction_loop + DB 0F3h,0C3h ;repret +bn_sqr8x_internal ENDP - sub rcx,r15 +ALIGN 32 +__bn_post4x_internal PROC PRIVATE + mov r12,QWORD PTR[rbp] lea rbx,QWORD PTR[r9*1+rdi] - adc rsi,rsi mov rcx,r9 - or rax,rsi DB 102,72,15,126,207 - xor rax,1 + neg rax DB 102,72,15,126,206 - lea rbp,QWORD PTR[rax*8+rbp] sar rcx,3+2 - jmp $L$sqr4x_sub + dec r12 + xor r10,r10 + mov r13,QWORD PTR[8+rbp] + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] + jmp $L$sqr4x_sub_entry -ALIGN 32 +ALIGN 16 $L$sqr4x_sub:: -DB 066h - mov r12,QWORD PTR[rbx] - mov r13,QWORD PTR[8+rbx] - sbb r12,QWORD PTR[rbp] - mov r14,QWORD PTR[16+rbx] - sbb r13,QWORD PTR[16+rbp] - mov r15,QWORD PTR[24+rbx] - lea rbx,QWORD PTR[32+rbx] - sbb r14,QWORD PTR[32+rbp] + mov r12,QWORD PTR[rbp] + mov r13,QWORD PTR[8+rbp] + mov r14,QWORD PTR[16+rbp] + mov r15,QWORD PTR[24+rbp] +$L$sqr4x_sub_entry:: + lea rbp,QWORD PTR[32+rbp] + not r12 + not r13 + not r14 + not r15 + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + neg r10 + adc r12,QWORD PTR[rbx] + adc r13,QWORD 
PTR[8+rbx] + adc r14,QWORD PTR[16+rbx] + adc r15,QWORD PTR[24+rbx] mov QWORD PTR[rdi],r12 - sbb r15,QWORD PTR[48+rbp] - lea rbp,QWORD PTR[64+rbp] + lea rbx,QWORD PTR[32+rbx] mov QWORD PTR[8+rdi],r13 + sbb r10,r10 mov QWORD PTR[16+rdi],r14 mov QWORD PTR[24+rdi],r15 lea rdi,QWORD PTR[32+rdi] inc rcx jnz $L$sqr4x_sub + mov r10,r9 neg r9 DB 0F3h,0C3h ;repret -bn_sqr8x_internal ENDP +__bn_post4x_internal ENDP PUBLIC bn_from_montgomery ALIGN 32 @@ -1727,13 +1998,9 @@ DB 067h push r13 push r14 push r15 - lea rsp,QWORD PTR[((-40))+rsp] - movaps XMMWORD PTR[rsp],xmm6 - movaps XMMWORD PTR[16+rsp],xmm7 -DB 067h - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10,QWORD PTR[r9*2+r9] neg r9 mov r8,QWORD PTR[r8] @@ -1743,19 +2010,20 @@ DB 067h - lea r11,QWORD PTR[((-64))+r9*2+rsp] - sub r11,rsi + + lea r11,QWORD PTR[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb $L$from_sp_alt sub rsp,r11 - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] jmp $L$from_sp_done ALIGN 32 $L$from_sp_alt:: - lea r10,QWORD PTR[((4096-64))+r9*2] - lea rsp,QWORD PTR[((-64))+r9*2+rsp] + lea r10,QWORD PTR[((4096-320))+r9*2] + lea rsp,QWORD PTR[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -1806,7 +2074,8 @@ DB 102,72,15,110,209 DB 067h mov rbp,rcx DB 102,73,15,110,218 - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor xmm0,xmm0 lea rax,QWORD PTR[48+rsp] @@ -1876,55 +2145,171 @@ bn_scatter5 ENDP PUBLIC bn_gather5 -ALIGN 16 +ALIGN 32 bn_gather5 PROC PUBLIC $L$SEH_begin_bn_gather5:: -DB 048h,083h,0ech,028h -DB 00fh,029h,034h,024h -DB 00fh,029h,07ch,024h,010h - mov r11d,r9d - shr r9d,3 - and r11,7 - not r9d - lea rax,QWORD PTR[$L$magic_masks] - and r9d,3 - lea r8,QWORD PTR[128+r11*8+r8] - movq xmm4,QWORD PTR[r9*8+rax] - movq xmm5,QWORD PTR[8+r9*8+rax] - movq xmm6,QWORD PTR[16+r9*8+rax] - movq xmm7,QWORD PTR[24+r9*8+rax] +DB 04ch,08dh,014h,024h +DB 048h,081h,0ech,008h,001h,000h,000h + lea rax,QWORD PTR[$L$inc] + and rsp,-16 + + movd xmm5,r9d + movdqa xmm0,XMMWORD PTR[rax] + movdqa xmm1,XMMWORD PTR[16+rax] + lea r11,QWORD PTR[128+r8] + lea rax,QWORD PTR[128+rsp] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[(-128)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[(-112)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[(-96)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[(-80)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[(-64)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[(-48)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[(-32)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[(-16)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[16+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[32+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD PTR[48+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD PTR[64+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD PTR[80+rax],xmm1 + 
movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD PTR[96+rax],xmm2 + movdqa xmm2,xmm4 + movdqa XMMWORD PTR[112+rax],xmm3 jmp $L$gather -ALIGN 16 -$L$gather:: - movq xmm0,QWORD PTR[((-128))+r8] - movq xmm1,QWORD PTR[((-64))+r8] - pand xmm0,xmm4 - movq xmm2,QWORD PTR[r8] - pand xmm1,xmm5 - movq xmm3,QWORD PTR[64+r8] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 -DB 067h,067h - por xmm0,xmm2 - lea r8,QWORD PTR[256+r8] - por xmm0,xmm3 +ALIGN 32 +$L$gather:: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD PTR[((-128))+r11] + movdqa xmm1,XMMWORD PTR[((-112))+r11] + movdqa xmm2,XMMWORD PTR[((-96))+r11] + pand xmm0,XMMWORD PTR[((-128))+rax] + movdqa xmm3,XMMWORD PTR[((-80))+r11] + pand xmm1,XMMWORD PTR[((-112))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-96))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-80))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[((-64))+r11] + movdqa xmm1,XMMWORD PTR[((-48))+r11] + movdqa xmm2,XMMWORD PTR[((-32))+r11] + pand xmm0,XMMWORD PTR[((-64))+rax] + movdqa xmm3,XMMWORD PTR[((-16))+r11] + pand xmm1,XMMWORD PTR[((-48))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[((-32))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[((-16))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[r11] + movdqa xmm1,XMMWORD PTR[16+r11] + movdqa xmm2,XMMWORD PTR[32+r11] + pand xmm0,XMMWORD PTR[rax] + movdqa xmm3,XMMWORD PTR[48+r11] + pand xmm1,XMMWORD PTR[16+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[32+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[48+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD PTR[64+r11] + movdqa xmm1,XMMWORD PTR[80+r11] + movdqa xmm2,XMMWORD PTR[96+r11] + pand xmm0,XMMWORD PTR[64+rax] + movdqa xmm3,XMMWORD PTR[112+r11] + pand xmm1,XMMWORD PTR[80+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD PTR[96+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD PTR[112+rax] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + lea r11,QWORD PTR[256+r11] + pshufd xmm0,xmm4,04eh + por xmm0,xmm4 movq QWORD PTR[rcx],xmm0 lea rcx,QWORD PTR[8+rcx] sub edx,1 jnz $L$gather - movaps xmm6,XMMWORD PTR[rsp] - movaps xmm7,XMMWORD PTR[16+rsp] - lea rsp,QWORD PTR[40+rsp] + + lea rsp,QWORD PTR[r10] DB 0F3h,0C3h ;repret $L$SEH_end_bn_gather5:: bn_gather5 ENDP ALIGN 64 -$L$magic_masks:: - DD 0,0,0,0,0,0,-1,-1 - DD 0,0,0,0,0,0,0,0 +$L$inc:: + DD 0,0,1,1 + DD 2,2,2,2 DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 @@ -1966,19 +2351,16 @@ mul_handler PROC PRIVATE lea r10,QWORD PTR[$L$mul_epilogue] cmp rbx,r10 - jb $L$body_40 + ja $L$body_40 mov r10,QWORD PTR[192+r8] mov rax,QWORD PTR[8+r10*8+rax] + jmp $L$body_proceed $L$body_40:: mov rax,QWORD PTR[40+rax] $L$body_proceed:: - - movaps xmm0,XMMWORD PTR[((-88))+rax] - movaps xmm1,XMMWORD PTR[((-72))+rax] - mov rbx,QWORD PTR[((-8))+rax] mov rbp,QWORD PTR[((-16))+rax] mov r12,QWORD PTR[((-24))+rax] @@ -1991,8 +2373,6 @@ $L$body_proceed:: mov QWORD PTR[224+r8],r13 mov QWORD PTR[232+r8],r14 mov QWORD PTR[240+r8],r15 - movups XMMWORD PTR[512+r8],xmm0 - movups XMMWORD PTR[528+r8],xmm1 $L$common_seh_tail:: mov rdi,QWORD PTR[8+rax] @@ -2080,10 +2460,9 @@ DB 9,0,0,0 DD imagerel $L$from_body,imagerel $L$from_epilogue ALIGN 8 $L$SEH_info_bn_gather5:: -DB 001h,00dh,005h,000h -DB 00dh,078h,001h,000h -DB 008h,068h,000h,000h -DB 004h,042h,000h,000h +DB 001h,00bh,003h,00ah +DB 00bh,001h,021h,000h +DB 004h,0a3h,000h,000h ALIGN 8 .xdata ENDS diff --git 
a/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm b/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm index ef9b22fbfddc11..ca78bd52ccc9f2 100644 --- a/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm +++ b/deps/openssl/asm_obsolete/x64-win32-masm/ec/ecp_nistz256-x86_64.asm @@ -1303,6 +1303,7 @@ $L$SEH_begin_ecp_nistz256_point_double:: push r15 sub rsp,32*5+8 +$L$point_double_shortcutq:: movdqu xmm0,XMMWORD PTR[rsi] mov rbx,rsi movdqu xmm1,XMMWORD PTR[16+rsi] @@ -1577,6 +1578,7 @@ DB 102,72,15,110,199 mov r14,QWORD PTR[((64+8))+rbx] mov r15,QWORD PTR[((64+16))+rbx] mov r8,QWORD PTR[((64+24))+rbx] +DB 102,72,15,110,203 lea rsi,QWORD PTR[((64-0))+rbx] lea rdi,QWORD PTR[32+rsp] @@ -1668,7 +1670,7 @@ DB 102,73,15,126,217 test r8,r8 jnz $L$add_proceedq test r9,r9 - jz $L$add_proceedq + jz $L$add_doubleq DB 102,72,15,126,199 pxor xmm0,xmm0 @@ -1680,6 +1682,13 @@ DB 102,72,15,126,199 movdqu XMMWORD PTR[80+rdi],xmm0 jmp $L$add_doneq +ALIGN 32 +$L$add_doubleq:: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + jmp $L$point_double_shortcutq + ALIGN 32 $L$add_proceedq:: mov rax,QWORD PTR[((0+64))+rsp] diff --git a/deps/openssl/openssl/CHANGES b/deps/openssl/openssl/CHANGES index aa1f60d1441fcf..df4b6064ddb9e9 100644 --- a/deps/openssl/openssl/CHANGES +++ b/deps/openssl/openssl/CHANGES @@ -2,6 +2,138 @@ OpenSSL CHANGES _______________ + Changes between 1.0.2f and 1.0.2g [1 Mar 2016] + + * Disable weak ciphers in SSLv3 and up in default builds of OpenSSL. + Builds that are not configured with "enable-weak-ssl-ciphers" will not + provide any "EXPORT" or "LOW" strength ciphers. + [Viktor Dukhovni] + + * Disable SSLv2 default build, default negotiation and weak ciphers. SSLv2 + is by default disabled at build-time. Builds that are not configured with + "enable-ssl2" will not support SSLv2. Even if "enable-ssl2" is used, + users who want to negotiate SSLv2 via the version-flexible SSLv23_method() + will need to explicitly call either of: + + SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2); + or + SSL_clear_options(ssl, SSL_OP_NO_SSLv2); + + as appropriate. Even if either of those is used, or the application + explicitly uses the version-specific SSLv2_method() or its client and + server variants, SSLv2 ciphers vulnerable to exhaustive search key + recovery have been removed. Specifically, the SSLv2 40-bit EXPORT + ciphers, and SSLv2 56-bit DES are no longer available. + (CVE-2016-0800) + [Viktor Dukhovni] + + *) Fix a double-free in DSA code + + A double free bug was discovered when OpenSSL parses malformed DSA private + keys and could lead to a DoS attack or memory corruption for applications + that receive DSA private keys from untrusted sources. This scenario is + considered rare. + + This issue was reported to OpenSSL by Adam Langley(Google/BoringSSL) using + libFuzzer. + (CVE-2016-0705) + [Stephen Henson] + + *) Disable SRP fake user seed to address a server memory leak. + + Add a new method SRP_VBASE_get1_by_user that handles the seed properly. + + SRP_VBASE_get_by_user had inconsistent memory management behaviour. + In order to fix an unavoidable memory leak, SRP_VBASE_get_by_user + was changed to ignore the "fake user" SRP seed, even if the seed + is configured. + + Users should use SRP_VBASE_get1_by_user instead. Note that in + SRP_VBASE_get1_by_user, caller must free the returned value. 
Note + also that even though configuring the SRP seed attempts to hide + invalid usernames by continuing the handshake with fake + credentials, this behaviour is not constant time and no strong + guarantees are made that the handshake is indistinguishable from + that of a valid user. + (CVE-2016-0798) + [Emilia Käsper] + + *) Fix BN_hex2bn/BN_dec2bn NULL pointer deref/heap corruption + + In the BN_hex2bn function the number of hex digits is calculated using an + int value |i|. Later |bn_expand| is called with a value of |i * 4|. For + large values of |i| this can result in |bn_expand| not allocating any + memory because |i * 4| is negative. This can leave the internal BIGNUM data + field as NULL leading to a subsequent NULL ptr deref. For very large values + of |i|, the calculation |i * 4| could be a positive value smaller than |i|. + In this case memory is allocated to the internal BIGNUM data field, but it + is insufficiently sized leading to heap corruption. A similar issue exists + in BN_dec2bn. This could have security consequences if BN_hex2bn/BN_dec2bn + is ever called by user applications with very large untrusted hex/dec data. + This is anticipated to be a rare occurrence. + + All OpenSSL internal usage of these functions use data that is not expected + to be untrusted, e.g. config file data or application command line + arguments. If user developed applications generate config file data based + on untrusted data then it is possible that this could also lead to security + consequences. This is also anticipated to be rare. + + This issue was reported to OpenSSL by Guido Vranken. + (CVE-2016-0797) + [Matt Caswell] + + *) Fix memory issues in BIO_*printf functions + + The internal |fmtstr| function used in processing a "%s" format string in + the BIO_*printf functions could overflow while calculating the length of a + string and cause an OOB read when printing very long strings. + + Additionally the internal |doapr_outch| function can attempt to write to an + OOB memory location (at an offset from the NULL pointer) in the event of a + memory allocation failure. In 1.0.2 and below this could be caused where + the size of a buffer to be allocated is greater than INT_MAX. E.g. this + could be in processing a very long "%s" format string. Memory leaks can + also occur. + + The first issue may mask the second issue dependent on compiler behaviour. + These problems could enable attacks where large amounts of untrusted data + is passed to the BIO_*printf functions. If applications use these functions + in this way then they could be vulnerable. OpenSSL itself uses these + functions when printing out human-readable dumps of ASN.1 data. Therefore + applications that print this data could be vulnerable if the data is from + untrusted sources. OpenSSL command line applications could also be + vulnerable where they print out ASN.1 data, or if untrusted data is passed + as command line arguments. + + Libssl is not considered directly vulnerable. Additionally certificates etc + received via remote connections via libssl are also unlikely to be able to + trigger these issues because of message size limits enforced within libssl. + + This issue was reported to OpenSSL Guido Vranken. + (CVE-2016-0799) + [Matt Caswell] + + *) Side channel attack on modular exponentiation + + A side-channel attack was found which makes use of cache-bank conflicts on + the Intel Sandy-Bridge microarchitecture which could lead to the recovery + of RSA keys. 
The ability to exploit this issue is limited as it relies on + an attacker who has control of code in a thread running on the same + hyper-threaded core as the victim thread which is performing decryptions. + + This issue was reported to OpenSSL by Yuval Yarom, The University of + Adelaide and NICTA, Daniel Genkin, Technion and Tel Aviv University, and + Nadia Heninger, University of Pennsylvania with more information at + http://cachebleed.info. + (CVE-2016-0702) + [Andy Polyakov] + + *) Change the req app to generate a 2048-bit RSA/DSA key by default, + if no keysize is specified with default_bits. This fixes an + omission in an earlier change that changed all RSA/DSA key generation + apps to use 2048 bits by default. + [Emilia Käsper] + Changes between 1.0.2e and 1.0.2f [28 Jan 2016] *) DH small subgroups @@ -105,7 +237,7 @@ [Emilia Käsper] *) In DSA_generate_parameters_ex, if the provided seed is too short, - return an error + use a random seed, as already documented. [Rich Salz and Ismo Puustinen ] Changes between 1.0.2c and 1.0.2d [9 Jul 2015] diff --git a/deps/openssl/openssl/Configure b/deps/openssl/openssl/Configure index 4a715dc4373234..c98107a487188f 100755 --- a/deps/openssl/openssl/Configure +++ b/deps/openssl/openssl/Configure @@ -58,6 +58,10 @@ my $usage="Usage: Configure [no- ...] [enable- ...] [experimenta # library and will be loaded in run-time by the OpenSSL library. # sctp include SCTP support # 386 generate 80386 code +# enable-weak-ssl-ciphers +# Enable EXPORT and LOW SSLv3 ciphers that are disabled by +# default. Note, weak SSLv2 ciphers are unconditionally +# disabled. # no-sse2 disables IA-32 SSE2 code, above option implies no-sse2 # no- build without specified algorithm (rsa, idea, rc5, ...) # - + compiler options are passed through @@ -781,11 +785,13 @@ my %disabled = ( # "what" => "comment" [or special keyword "experimental "md2" => "default", "rc5" => "default", "rfc3779" => "default", - "sctp" => "default", + "sctp" => "default", "shared" => "default", "ssl-trace" => "default", + "ssl2" => "default", "store" => "experimental", "unit-test" => "default", + "weak-ssl-ciphers" => "default", "zlib" => "default", "zlib-dynamic" => "default" ); diff --git a/deps/openssl/openssl/Makefile.shared b/deps/openssl/openssl/Makefile.shared index e753f44e18fdf6..a2aa9804c1d98d 100644 --- a/deps/openssl/openssl/Makefile.shared +++ b/deps/openssl/openssl/Makefile.shared @@ -272,7 +272,7 @@ link_o.cygwin: SHLIB_SOVER=${LIBVERSION:+"-$(LIBVERSION)"}; \ ALLSYMSFLAGS='-Wl,--whole-archive'; \ NOALLSYMSFLAGS='-Wl,--no-whole-archive'; \ - SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base $$deffile -Wl,-s,-Bsymbolic"; \ + SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base $$deffile -Wl,-Bsymbolic"; \ $(LINK_SO_O) #for mingw target if def-file is in use dll-name should match library-name link_a.cygwin: @@ -289,7 +289,7 @@ link_a.cygwin: SHLIB_SOVER=32; \ extras="$(LIBNAME).def"; \ $(PERL) util/mkdef.pl 32 $$SHLIB > $$extras; \ - base=; [ $(LIBNAME) = "crypto" ] && base=-Wl,--image-base,0x63000000; \ + base=; [ $(LIBNAME) = "crypto" -a -n "$(FIPSCANLIB)" ] && base=-Wl,--image-base,0x63000000; \ fi; \ dll_name=$$SHLIB$$SHLIB_SOVER$$SHLIB_SUFFIX; \ $(PERL) util/mkrc.pl $$dll_name | \ @@ -297,7 +297,7 @@ link_a.cygwin: extras="$$extras rc.o"; \ ALLSYMSFLAGS='-Wl,--whole-archive'; \ NOALLSYMSFLAGS='-Wl,--no-whole-archive'; \ - SHAREDFLAGS="$(CFLAGS) $(SHARED_LDFLAGS) -shared $$base -Wl,-s,-Bsymbolic -Wl,--out-implib,lib$(LIBNAME).dll.a $$extras"; \ + SHAREDFLAGS="$(CFLAGS) 
$(SHARED_LDFLAGS) -shared $$base -Wl,-Bsymbolic -Wl,--out-implib,lib$(LIBNAME).dll.a $$extras"; \ [ -f apps/$$dll_name ] && rm apps/$$dll_name; \ [ -f test/$$dll_name ] && rm test/$$dll_name; \ $(LINK_SO_A) || exit 1; \ diff --git a/deps/openssl/openssl/NEWS b/deps/openssl/openssl/NEWS index 06c77025e999d7..33242c83624de2 100644 --- a/deps/openssl/openssl/NEWS +++ b/deps/openssl/openssl/NEWS @@ -5,6 +5,19 @@ This file gives a brief overview of the major changes between each OpenSSL release. For more details please read the CHANGES file. + Major changes between OpenSSL 1.0.2f and OpenSSL 1.0.2g [1 Mar 2016] + + o Disable weak ciphers in SSLv3 and up in default builds of OpenSSL. + o Disable SSLv2 default build, default negotiation and weak ciphers + (CVE-2016-0800) + o Fix a double-free in DSA code (CVE-2016-0705) + o Disable SRP fake user seed to address a server memory leak + (CVE-2016-0798) + o Fix BN_hex2bn/BN_dec2bn NULL pointer deref/heap corruption + (CVE-2016-0797) + o Fix memory issues in BIO_*printf functions (CVE-2016-0799) + o Fix side channel attack on modular exponentiation (CVE-2016-0702) + Major changes between OpenSSL 1.0.2e and OpenSSL 1.0.2f [28 Jan 2016] o DH small subgroups (CVE-2016-0701) diff --git a/deps/openssl/openssl/README b/deps/openssl/openssl/README index 1e9869daee000d..2077b04eb271d4 100644 --- a/deps/openssl/openssl/README +++ b/deps/openssl/openssl/README @@ -1,5 +1,5 @@ - OpenSSL 1.0.2f 28 Jan 2016 + OpenSSL 1.0.2g 1 Mar 2016 Copyright (c) 1998-2015 The OpenSSL Project Copyright (c) 1995-1998 Eric A. Young, Tim J. Hudson diff --git a/deps/openssl/openssl/apps/apps.c b/deps/openssl/openssl/apps/apps.c index 2e778054ca8fce..b1dd97038f7d7c 100644 --- a/deps/openssl/openssl/apps/apps.c +++ b/deps/openssl/openssl/apps/apps.c @@ -2442,7 +2442,11 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in) else len = 1024; len = BIO_read(in, tbuf, len); - if (len <= 0) + if (len < 0) { + BIO_free(mem); + return -1; + } + if (len == 0) break; if (BIO_write(mem, tbuf, len) != len) { BIO_free(mem); @@ -2459,7 +2463,7 @@ int bio_to_mem(unsigned char **out, int maxlen, BIO *in) return ret; } -int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value) +int pkey_ctrl_string(EVP_PKEY_CTX *ctx, const char *value) { int rv; char *stmp, *vtmp = NULL; diff --git a/deps/openssl/openssl/apps/apps.h b/deps/openssl/openssl/apps/apps.h index 8276e708694dd1..19bf5cc3337d7d 100644 --- a/deps/openssl/openssl/apps/apps.h +++ b/deps/openssl/openssl/apps/apps.h @@ -321,7 +321,7 @@ int args_verify(char ***pargs, int *pargc, int *badarg, BIO *err, X509_VERIFY_PARAM **pm); void policies_print(BIO *out, X509_STORE_CTX *ctx); int bio_to_mem(unsigned char **out, int maxlen, BIO *in); -int pkey_ctrl_string(EVP_PKEY_CTX *ctx, char *value); +int pkey_ctrl_string(EVP_PKEY_CTX *ctx, const char *value); int init_gen_str(BIO *err, EVP_PKEY_CTX **pctx, const char *algname, ENGINE *e, int do_param); int do_X509_sign(BIO *err, X509 *x, EVP_PKEY *pkey, const EVP_MD *md, diff --git a/deps/openssl/openssl/apps/pkeyutl.c b/deps/openssl/openssl/apps/pkeyutl.c index 501fd6304ad0e8..39faa451ab8df0 100644 --- a/deps/openssl/openssl/apps/pkeyutl.c +++ b/deps/openssl/openssl/apps/pkeyutl.c @@ -73,7 +73,7 @@ static void usage(void); #define PROG pkeyutl_main static EVP_PKEY_CTX *init_ctx(int *pkeysize, - char *keyfile, int keyform, int key_type, + const char *keyfile, int keyform, int key_type, char *passargin, int pkey_op, ENGINE *e, int impl); @@ -99,10 +99,12 @@ int MAIN(int argc, char **argv) char *passargin = 
NULL; int keysize = -1; int engine_impl = 0; - unsigned char *buf_in = NULL, *buf_out = NULL, *sig = NULL; - size_t buf_outlen; + size_t buf_outlen = 0; int buf_inlen = 0, siglen = -1; + const char *inkey = NULL; + const char *peerkey = NULL; + STACK_OF(OPENSSL_STRING) *pkeyopts = NULL; int ret = 1, rv = -1; @@ -136,21 +138,13 @@ int MAIN(int argc, char **argv) } else if (!strcmp(*argv, "-inkey")) { if (--argc < 1) badarg = 1; - else { - ctx = init_ctx(&keysize, - *(++argv), keyform, key_type, - passargin, pkey_op, e, engine_impl); - if (!ctx) { - BIO_puts(bio_err, "Error initializing context\n"); - ERR_print_errors(bio_err); - badarg = 1; - } - } + else + inkey = *++argv; } else if (!strcmp(*argv, "-peerkey")) { if (--argc < 1) badarg = 1; - else if (!setup_peer(bio_err, ctx, peerform, *(++argv), e)) - badarg = 1; + else + peerkey = *++argv; } else if (!strcmp(*argv, "-passin")) { if (--argc < 1) badarg = 1; @@ -191,23 +185,21 @@ int MAIN(int argc, char **argv) pkey_op = EVP_PKEY_OP_VERIFY; else if (!strcmp(*argv, "-verifyrecover")) pkey_op = EVP_PKEY_OP_VERIFYRECOVER; - else if (!strcmp(*argv, "-rev")) - rev = 1; else if (!strcmp(*argv, "-encrypt")) pkey_op = EVP_PKEY_OP_ENCRYPT; else if (!strcmp(*argv, "-decrypt")) pkey_op = EVP_PKEY_OP_DECRYPT; else if (!strcmp(*argv, "-derive")) pkey_op = EVP_PKEY_OP_DERIVE; + else if (!strcmp(*argv, "-rev")) + rev = 1; else if (strcmp(*argv, "-pkeyopt") == 0) { if (--argc < 1) badarg = 1; - else if (!ctx) { - BIO_puts(bio_err, "-pkeyopt command before -inkey\n"); - badarg = 1; - } else if (pkey_ctrl_string(ctx, *(++argv)) <= 0) { - BIO_puts(bio_err, "parameter setting error\n"); - ERR_print_errors(bio_err); + else if ((pkeyopts == NULL && + (pkeyopts = sk_OPENSSL_STRING_new_null()) == NULL) || + sk_OPENSSL_STRING_push(pkeyopts, *++argv) == 0) { + BIO_puts(bio_err, "out of memory\n"); goto end; } } else @@ -220,10 +212,37 @@ int MAIN(int argc, char **argv) argv++; } - if (!ctx) { + if (inkey == NULL || + (peerkey != NULL && pkey_op != EVP_PKEY_OP_DERIVE)) { usage(); goto end; } + ctx = init_ctx(&keysize, inkey, keyform, key_type, + passargin, pkey_op, e, engine_impl); + if (!ctx) { + BIO_puts(bio_err, "Error initializing context\n"); + ERR_print_errors(bio_err); + goto end; + } + if (peerkey != NULL && !setup_peer(bio_err, ctx, peerform, peerkey, e)) { + BIO_puts(bio_err, "Error setting up peer key\n"); + ERR_print_errors(bio_err); + goto end; + } + if (pkeyopts != NULL) { + int num = sk_OPENSSL_STRING_num(pkeyopts); + int i; + + for (i = 0; i < num; ++i) { + const char *opt = sk_OPENSSL_STRING_value(pkeyopts, i); + + if (pkey_ctrl_string(ctx, opt) <= 0) { + BIO_puts(bio_err, "parameter setting error\n"); + ERR_print_errors(bio_err); + goto end; + } + } + } if (sigfile && (pkey_op != EVP_PKEY_OP_VERIFY)) { BIO_puts(bio_err, "Signature file specified for non verify\n"); @@ -273,7 +292,7 @@ int MAIN(int argc, char **argv) } siglen = bio_to_mem(&sig, keysize * 10, sigbio); BIO_free(sigbio); - if (siglen <= 0) { + if (siglen < 0) { BIO_printf(bio_err, "Error reading signature data\n"); goto end; } @@ -282,7 +301,7 @@ int MAIN(int argc, char **argv) if (in) { /* Read the input data */ buf_inlen = bio_to_mem(&buf_in, keysize * 10, in); - if (buf_inlen <= 0) { + if (buf_inlen < 0) { BIO_printf(bio_err, "Error reading input Data\n"); exit(1); } @@ -310,7 +329,7 @@ int MAIN(int argc, char **argv) } else { rv = do_keyop(ctx, pkey_op, NULL, (size_t *)&buf_outlen, buf_in, (size_t)buf_inlen); - if (rv > 0) { + if (rv > 0 && buf_outlen != 0) { buf_out = 
OPENSSL_malloc(buf_outlen); if (!buf_out) rv = -1; @@ -340,12 +359,14 @@ int MAIN(int argc, char **argv) EVP_PKEY_CTX_free(ctx); BIO_free(in); BIO_free_all(out); - if (buf_in) + if (buf_in != NULL) OPENSSL_free(buf_in); - if (buf_out) + if (buf_out != NULL) OPENSSL_free(buf_out); - if (sig) + if (sig != NULL) OPENSSL_free(sig); + if (pkeyopts != NULL) + sk_OPENSSL_STRING_free(pkeyopts); return ret; } @@ -380,7 +401,7 @@ static void usage() } static EVP_PKEY_CTX *init_ctx(int *pkeysize, - char *keyfile, int keyform, int key_type, + const char *keyfile, int keyform, int key_type, char *passargin, int pkey_op, ENGINE *e, int engine_impl) { @@ -484,14 +505,9 @@ static int setup_peer(BIO *err, EVP_PKEY_CTX *ctx, int peerform, EVP_PKEY *peer = NULL; ENGINE* engine = NULL; int ret; - if (!ctx) { - BIO_puts(err, "-peerkey command before -inkey\n"); - return 0; - } if (peerform == FORMAT_ENGINE) - engine = e; - + engine = e; peer = load_pubkey(bio_err, file, peerform, 0, NULL, engine, "Peer Key"); if (!peer) { diff --git a/deps/openssl/openssl/apps/req.c b/deps/openssl/openssl/apps/req.c index 57781c93c4cafe..e818bd2976d6db 100644 --- a/deps/openssl/openssl/apps/req.c +++ b/deps/openssl/openssl/apps/req.c @@ -101,8 +101,8 @@ #define STRING_MASK "string_mask" #define UTF8_IN "utf8" -#define DEFAULT_KEY_LENGTH 512 -#define MIN_KEY_LENGTH 384 +#define DEFAULT_KEY_LENGTH 2048 +#define MIN_KEY_LENGTH 512 #undef PROG #define PROG req_main diff --git a/deps/openssl/openssl/apps/rsautl.c b/deps/openssl/openssl/apps/rsautl.c index d642f9ad97f3c1..5b6f849ea74d6e 100644 --- a/deps/openssl/openssl/apps/rsautl.c +++ b/deps/openssl/openssl/apps/rsautl.c @@ -250,7 +250,7 @@ int MAIN(int argc, char **argv) if (outfile) { if (!(out = BIO_new_file(outfile, "wb"))) { - BIO_printf(bio_err, "Error Reading Output File\n"); + BIO_printf(bio_err, "Error Writing Output File\n"); ERR_print_errors(bio_err); goto end; } @@ -276,7 +276,7 @@ int MAIN(int argc, char **argv) /* Read the input data */ rsa_inlen = BIO_read(in, rsa_in, keysize * 2); - if (rsa_inlen <= 0) { + if (rsa_inlen < 0) { BIO_printf(bio_err, "Error reading input Data\n"); exit(1); } @@ -311,7 +311,7 @@ int MAIN(int argc, char **argv) } - if (rsa_outlen <= 0) { + if (rsa_outlen < 0) { BIO_printf(bio_err, "RSA operation error\n"); ERR_print_errors(bio_err); goto end; diff --git a/deps/openssl/openssl/apps/s_client.c b/deps/openssl/openssl/apps/s_client.c index 2abef8869a1310..bc8004a5551524 100644 --- a/deps/openssl/openssl/apps/s_client.c +++ b/deps/openssl/openssl/apps/s_client.c @@ -398,8 +398,6 @@ static void sc_usage(void) " -no_tls1_2/-no_tls1_1/-no_tls1/-no_ssl3/-no_ssl2 - turn off that protocol\n"); BIO_printf(bio_err, " -bugs - Switch on all SSL implementation bug workarounds\n"); - BIO_printf(bio_err, - " -serverpref - Use server's cipher preferences (only SSLv2)\n"); BIO_printf(bio_err, " -cipher - preferred cipher to use, use the 'openssl ciphers'\n"); BIO_printf(bio_err, diff --git a/deps/openssl/openssl/apps/s_server.c b/deps/openssl/openssl/apps/s_server.c index 65cbaaf6eb9b2e..09c755b55cfe16 100644 --- a/deps/openssl/openssl/apps/s_server.c +++ b/deps/openssl/openssl/apps/s_server.c @@ -429,6 +429,8 @@ typedef struct srpsrvparm_st { static int MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg) { srpsrvparm *p = (srpsrvparm *) arg; + int ret = SSL3_AL_FATAL; + if (p->login == NULL && p->user == NULL) { p->login = SSL_get_srp_username(s); BIO_printf(bio_err, "SRP username = \"%s\"\n", p->login); @@ -437,21 +439,25 @@ static int 
MS_CALLBACK ssl_srp_server_param_cb(SSL *s, int *ad, void *arg) if (p->user == NULL) { BIO_printf(bio_err, "User %s doesn't exist\n", p->login); - return SSL3_AL_FATAL; + goto err; } + if (SSL_set_srp_server_param (s, p->user->N, p->user->g, p->user->s, p->user->v, p->user->info) < 0) { *ad = SSL_AD_INTERNAL_ERROR; - return SSL3_AL_FATAL; + goto err; } BIO_printf(bio_err, "SRP parameters set: username = \"%s\" info=\"%s\" \n", p->login, p->user->info); - /* need to check whether there are memory leaks */ + ret = SSL_ERROR_NONE; + +err: + SRP_user_pwd_free(p->user); p->user = NULL; p->login = NULL; - return SSL_ERROR_NONE; + return ret; } #endif @@ -2452,9 +2458,10 @@ static int sv_body(char *hostname, int s, int stype, unsigned char *context) #ifndef OPENSSL_NO_SRP while (SSL_get_error(con, k) == SSL_ERROR_WANT_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP renego during write\n"); + SRP_user_pwd_free(srp_callback_parm.user); srp_callback_parm.user = - SRP_VBASE_get_by_user(srp_callback_parm.vb, - srp_callback_parm.login); + SRP_VBASE_get1_by_user(srp_callback_parm.vb, + srp_callback_parm.login); if (srp_callback_parm.user) BIO_printf(bio_s_out, "LOOKUP done %s\n", srp_callback_parm.user->info); @@ -2508,9 +2515,10 @@ static int sv_body(char *hostname, int s, int stype, unsigned char *context) #ifndef OPENSSL_NO_SRP while (SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP renego during read\n"); + SRP_user_pwd_free(srp_callback_parm.user); srp_callback_parm.user = - SRP_VBASE_get_by_user(srp_callback_parm.vb, - srp_callback_parm.login); + SRP_VBASE_get1_by_user(srp_callback_parm.vb, + srp_callback_parm.login); if (srp_callback_parm.user) BIO_printf(bio_s_out, "LOOKUP done %s\n", srp_callback_parm.user->info); @@ -2605,9 +2613,10 @@ static int init_ssl_connection(SSL *con) while (i <= 0 && SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP during accept %s\n", srp_callback_parm.login); + SRP_user_pwd_free(srp_callback_parm.user); srp_callback_parm.user = - SRP_VBASE_get_by_user(srp_callback_parm.vb, - srp_callback_parm.login); + SRP_VBASE_get1_by_user(srp_callback_parm.vb, + srp_callback_parm.login); if (srp_callback_parm.user) BIO_printf(bio_s_out, "LOOKUP done %s\n", srp_callback_parm.user->info); @@ -2849,9 +2858,10 @@ static int www_body(char *hostname, int s, int stype, unsigned char *context) && SSL_get_error(con, i) == SSL_ERROR_WANT_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP during accept %s\n", srp_callback_parm.login); + SRP_user_pwd_free(srp_callback_parm.user); srp_callback_parm.user = - SRP_VBASE_get_by_user(srp_callback_parm.vb, - srp_callback_parm.login); + SRP_VBASE_get1_by_user(srp_callback_parm.vb, + srp_callback_parm.login); if (srp_callback_parm.user) BIO_printf(bio_s_out, "LOOKUP done %s\n", srp_callback_parm.user->info); @@ -2891,9 +2901,10 @@ static int www_body(char *hostname, int s, int stype, unsigned char *context) if (BIO_should_io_special(io) && BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP renego during read\n"); + SRP_user_pwd_free(srp_callback_parm.user); srp_callback_parm.user = - SRP_VBASE_get_by_user(srp_callback_parm.vb, - srp_callback_parm.login); + SRP_VBASE_get1_by_user(srp_callback_parm.vb, + srp_callback_parm.login); if (srp_callback_parm.user) BIO_printf(bio_s_out, "LOOKUP done %s\n", srp_callback_parm.user->info); @@ -3236,9 +3247,10 @@ static int rev_body(char *hostname, int s, int stype, unsigned char *context) if (BIO_should_io_special(io) 
&& BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP renego during accept\n"); + SRP_user_pwd_free(srp_callback_parm.user); srp_callback_parm.user = - SRP_VBASE_get_by_user(srp_callback_parm.vb, - srp_callback_parm.login); + SRP_VBASE_get1_by_user(srp_callback_parm.vb, + srp_callback_parm.login); if (srp_callback_parm.user) BIO_printf(bio_s_out, "LOOKUP done %s\n", srp_callback_parm.user->info); @@ -3264,9 +3276,10 @@ static int rev_body(char *hostname, int s, int stype, unsigned char *context) if (BIO_should_io_special(io) && BIO_get_retry_reason(io) == BIO_RR_SSL_X509_LOOKUP) { BIO_printf(bio_s_out, "LOOKUP renego during read\n"); + SRP_user_pwd_free(srp_callback_parm.user); srp_callback_parm.user = - SRP_VBASE_get_by_user(srp_callback_parm.vb, - srp_callback_parm.login); + SRP_VBASE_get1_by_user(srp_callback_parm.vb, + srp_callback_parm.login); if (srp_callback_parm.user) BIO_printf(bio_s_out, "LOOKUP done %s\n", srp_callback_parm.user->info); diff --git a/deps/openssl/openssl/config b/deps/openssl/openssl/config index 77f730f093e61b..bba370c4f3f1cb 100755 --- a/deps/openssl/openssl/config +++ b/deps/openssl/openssl/config @@ -852,7 +852,8 @@ case "$GUESSOS" in # *-dgux) OUT="dgux" ;; mips-sony-newsos4) OUT="newsos4-gcc" ;; *-*-cygwin_pre1.3) OUT="Cygwin-pre1.3" ;; - *-*-cygwin) OUT="Cygwin" ;; + i[3456]86-*-cygwin) OUT="Cygwin" ;; + *-*-cygwin) OUT="Cygwin-${MACHINE}" ;; t3e-cray-unicosmk) OUT="cray-t3e" ;; j90-cray-unicos) OUT="cray-j90" ;; nsr-tandem-nsk) OUT="tandem-c89" ;; diff --git a/deps/openssl/openssl/crypto/asn1/tasn_dec.c b/deps/openssl/openssl/crypto/asn1/tasn_dec.c index 9256049d158814..5a507967c894f2 100644 --- a/deps/openssl/openssl/crypto/asn1/tasn_dec.c +++ b/deps/openssl/openssl/crypto/asn1/tasn_dec.c @@ -717,7 +717,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, long plen; char cst, inf, free_cont = 0; const unsigned char *p; - BUF_MEM buf; + BUF_MEM buf = { 0, NULL, 0 }; const unsigned char *cont = NULL; long len; if (!pval) { @@ -793,7 +793,6 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, } else { len = p - cont + plen; p += plen; - buf.data = NULL; } } else if (cst) { if (utype == V_ASN1_NULL || utype == V_ASN1_BOOLEAN @@ -802,9 +801,9 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ASN1_R_TYPE_NOT_PRIMITIVE); return 0; } - buf.length = 0; - buf.max = 0; - buf.data = NULL; + + /* Free any returned 'buf' content */ + free_cont = 1; /* * Should really check the internal tags are correct but some things * may get this wrong. The relevant specs say that constructed string @@ -812,18 +811,16 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, * So instead just check for UNIVERSAL class and ignore the tag. 
*/ if (!asn1_collect(&buf, &p, plen, inf, -1, V_ASN1_UNIVERSAL, 0)) { - free_cont = 1; goto err; } len = buf.length; /* Append a final null to string */ if (!BUF_MEM_grow_clean(&buf, len + 1)) { ASN1err(ASN1_F_ASN1_D2I_EX_PRIMITIVE, ERR_R_MALLOC_FAILURE); - return 0; + goto err; } buf.data[len] = 0; cont = (const unsigned char *)buf.data; - free_cont = 1; } else { cont = p; len = plen; @@ -831,6 +828,7 @@ static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, } /* We now have content length and type: translate into a structure */ + /* asn1_ex_c2i may reuse allocated buffer, and so sets free_cont to 0 */ if (!asn1_ex_c2i(pval, cont, len, utype, &free_cont, it)) goto err; diff --git a/deps/openssl/openssl/crypto/bio/b_print.c b/deps/openssl/openssl/crypto/bio/b_print.c index 7c81e25d482cf1..90248fa2aabac7 100644 --- a/deps/openssl/openssl/crypto/bio/b_print.c +++ b/deps/openssl/openssl/crypto/bio/b_print.c @@ -125,16 +125,16 @@ # define LLONG long #endif -static void fmtstr(char **, char **, size_t *, size_t *, - const char *, int, int, int); -static void fmtint(char **, char **, size_t *, size_t *, - LLONG, int, int, int, int); -static void fmtfp(char **, char **, size_t *, size_t *, - LDOUBLE, int, int, int); -static void doapr_outch(char **, char **, size_t *, size_t *, int); -static void _dopr(char **sbuffer, char **buffer, - size_t *maxlen, size_t *retlen, int *truncated, - const char *format, va_list args); +static int fmtstr(char **, char **, size_t *, size_t *, + const char *, int, int, int); +static int fmtint(char **, char **, size_t *, size_t *, + LLONG, int, int, int, int); +static int fmtfp(char **, char **, size_t *, size_t *, + LDOUBLE, int, int, int); +static int doapr_outch(char **, char **, size_t *, size_t *, int); +static int _dopr(char **sbuffer, char **buffer, + size_t *maxlen, size_t *retlen, int *truncated, + const char *format, va_list args); /* format read states */ #define DP_S_DEFAULT 0 @@ -165,7 +165,7 @@ static void _dopr(char **sbuffer, char **buffer, #define char_to_int(p) (p - '0') #define OSSL_MAX(p,q) ((p >= q) ? p : q) -static void +static int _dopr(char **sbuffer, char **buffer, size_t *maxlen, @@ -196,7 +196,8 @@ _dopr(char **sbuffer, if (ch == '%') state = DP_S_FLAGS; else - doapr_outch(sbuffer, buffer, &currlen, maxlen, ch); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch)) + return 0; ch = *format++; break; case DP_S_FLAGS: @@ -302,8 +303,9 @@ _dopr(char **sbuffer, value = va_arg(args, int); break; } - fmtint(sbuffer, buffer, &currlen, maxlen, - value, 10, min, max, flags); + if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, 10, min, + max, flags)) + return 0; break; case 'X': flags |= DP_F_UP; @@ -326,17 +328,19 @@ _dopr(char **sbuffer, value = (LLONG) va_arg(args, unsigned int); break; } - fmtint(sbuffer, buffer, &currlen, maxlen, value, - ch == 'o' ? 8 : (ch == 'u' ? 10 : 16), - min, max, flags); + if (!fmtint(sbuffer, buffer, &currlen, maxlen, value, + ch == 'o' ? 8 : (ch == 'u' ? 
10 : 16), + min, max, flags)) + return 0; break; case 'f': if (cflags == DP_C_LDOUBLE) fvalue = va_arg(args, LDOUBLE); else fvalue = va_arg(args, double); - fmtfp(sbuffer, buffer, &currlen, maxlen, - fvalue, min, max, flags); + if (!fmtfp(sbuffer, buffer, &currlen, maxlen, fvalue, min, max, + flags)) + return 0; break; case 'E': flags |= DP_F_UP; @@ -355,8 +359,9 @@ _dopr(char **sbuffer, fvalue = va_arg(args, double); break; case 'c': - doapr_outch(sbuffer, buffer, &currlen, maxlen, - va_arg(args, int)); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, + va_arg(args, int))) + return 0; break; case 's': strvalue = va_arg(args, char *); @@ -366,13 +371,15 @@ _dopr(char **sbuffer, else max = *maxlen; } - fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue, - flags, min, max); + if (!fmtstr(sbuffer, buffer, &currlen, maxlen, strvalue, + flags, min, max)) + return 0; break; case 'p': value = (long)va_arg(args, void *); - fmtint(sbuffer, buffer, &currlen, maxlen, - value, 16, min, max, flags | DP_F_NUM); + if (!fmtint(sbuffer, buffer, &currlen, maxlen, + value, 16, min, max, flags | DP_F_NUM)) + return 0; break; case 'n': /* XXX */ if (cflags == DP_C_SHORT) { @@ -394,7 +401,8 @@ _dopr(char **sbuffer, } break; case '%': - doapr_outch(sbuffer, buffer, &currlen, maxlen, ch); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, ch)) + return 0; break; case 'w': /* not supported yet, treat as next char */ @@ -418,46 +426,56 @@ _dopr(char **sbuffer, *truncated = (currlen > *maxlen - 1); if (*truncated) currlen = *maxlen - 1; - doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0'); + if(!doapr_outch(sbuffer, buffer, &currlen, maxlen, '\0')) + return 0; *retlen = currlen - 1; - return; + return 1; } -static void +static int fmtstr(char **sbuffer, char **buffer, size_t *currlen, size_t *maxlen, const char *value, int flags, int min, int max) { - int padlen, strln; + int padlen; + size_t strln; int cnt = 0; if (value == 0) value = ""; - for (strln = 0; value[strln]; ++strln) ; + + strln = strlen(value); + if (strln > INT_MAX) + strln = INT_MAX; + padlen = min - strln; - if (padlen < 0) + if (min < 0 || padlen < 0) padlen = 0; if (flags & DP_F_MINUS) padlen = -padlen; while ((padlen > 0) && (cnt < max)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; --padlen; ++cnt; } while (*value && (cnt < max)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, *value++); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *value++)) + return 0; ++cnt; } while ((padlen < 0) && (cnt < max)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; ++padlen; ++cnt; } + return 1; } -static void +static int fmtint(char **sbuffer, char **buffer, size_t *currlen, @@ -517,37 +535,44 @@ fmtint(char **sbuffer, /* spaces */ while (spadlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; --spadlen; } /* sign */ if (signvalue) - doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue)) + return 0; /* prefix */ while (*prefix) { - doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, *prefix)) + return 0; prefix++; } /* zeros */ if (zpadlen > 0) { while (zpadlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '0'); + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, '0')) + return 0; 
--zpadlen; } } /* digits */ - while (place > 0) - doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place]); + while (place > 0) { + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, convert[--place])) + return 0; + } /* left justified spaces */ while (spadlen < 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; ++spadlen; } - return; + return 1; } static LDOUBLE abs_val(LDOUBLE value) @@ -578,7 +603,7 @@ static long roundv(LDOUBLE value) return intpart; } -static void +static int fmtfp(char **sbuffer, char **buffer, size_t *currlen, @@ -657,47 +682,61 @@ fmtfp(char **sbuffer, if ((flags & DP_F_ZERO) && (padlen > 0)) { if (signvalue) { - doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue)) + return 0; --padlen; signvalue = 0; } while (padlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '0'); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0')) + return 0; --padlen; } } while (padlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; --padlen; } - if (signvalue) - doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue); + if (signvalue && !doapr_outch(sbuffer, buffer, currlen, maxlen, signvalue)) + return 0; - while (iplace > 0) - doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace]); + while (iplace > 0) { + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, iconvert[--iplace])) + return 0; + } /* * Decimal point. This should probably use locale to find the correct * char to print out. */ if (max > 0 || (flags & DP_F_NUM)) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '.'); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '.')) + return 0; - while (fplace > 0) - doapr_outch(sbuffer, buffer, currlen, maxlen, fconvert[--fplace]); + while (fplace > 0) { + if(!doapr_outch(sbuffer, buffer, currlen, maxlen, + fconvert[--fplace])) + return 0; + } } while (zpadlen > 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, '0'); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, '0')) + return 0; --zpadlen; } while (padlen < 0) { - doapr_outch(sbuffer, buffer, currlen, maxlen, ' '); + if (!doapr_outch(sbuffer, buffer, currlen, maxlen, ' ')) + return 0; ++padlen; } + return 1; } -static void +#define BUFFER_INC 1024 + +static int doapr_outch(char **sbuffer, char **buffer, size_t *currlen, size_t *maxlen, int c) { @@ -708,24 +747,25 @@ doapr_outch(char **sbuffer, assert(*currlen <= *maxlen); if (buffer && *currlen == *maxlen) { - *maxlen += 1024; + if (*maxlen > INT_MAX - BUFFER_INC) + return 0; + + *maxlen += BUFFER_INC; if (*buffer == NULL) { *buffer = OPENSSL_malloc(*maxlen); - if (!*buffer) { - /* Panic! Can't really do anything sensible. Just return */ - return; - } + if (*buffer == NULL) + return 0; if (*currlen > 0) { assert(*sbuffer != NULL); memcpy(*buffer, *sbuffer, *currlen); } *sbuffer = NULL; } else { - *buffer = OPENSSL_realloc(*buffer, *maxlen); - if (!*buffer) { - /* Panic! Can't really do anything sensible. 
Just return */ - return; - } + char *tmpbuf; + tmpbuf = OPENSSL_realloc(*buffer, *maxlen); + if (tmpbuf == NULL) + return 0; + *buffer = tmpbuf; } } @@ -736,7 +776,7 @@ doapr_outch(char **sbuffer, (*buffer)[(*currlen)++] = (char)c; } - return; + return 1; } /***************************************************************************/ @@ -768,7 +808,11 @@ int BIO_vprintf(BIO *bio, const char *format, va_list args) dynbuf = NULL; CRYPTO_push_info("doapr()"); - _dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, args); + if (!_dopr(&hugebufp, &dynbuf, &hugebufsize, &retlen, &ignored, format, + args)) { + OPENSSL_free(dynbuf); + return -1; + } if (dynbuf) { ret = BIO_write(bio, dynbuf, (int)retlen); OPENSSL_free(dynbuf); @@ -803,7 +847,8 @@ int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args) size_t retlen; int truncated; - _dopr(&buf, NULL, &n, &retlen, &truncated, format, args); + if(!_dopr(&buf, NULL, &n, &retlen, &truncated, format, args)) + return -1; if (truncated) /* diff --git a/deps/openssl/openssl/crypto/bio/bio.h b/deps/openssl/openssl/crypto/bio/bio.h index 6e2293bc66daad..6790aed28e0bae 100644 --- a/deps/openssl/openssl/crypto/bio/bio.h +++ b/deps/openssl/openssl/crypto/bio/bio.h @@ -479,7 +479,7 @@ struct bio_dgram_sctp_prinfo { # define BIO_get_conn_hostname(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0) # define BIO_get_conn_port(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1) # define BIO_get_conn_ip(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2) -# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,0,NULL) +# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL) # define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL) @@ -689,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi, long argl, long ret); BIO_METHOD *BIO_s_mem(void); -BIO *BIO_new_mem_buf(void *buf, int len); +BIO *BIO_new_mem_buf(const void *buf, int len); BIO_METHOD *BIO_s_socket(void); BIO_METHOD *BIO_s_connect(void); BIO_METHOD *BIO_s_accept(void); diff --git a/deps/openssl/openssl/crypto/bio/bss_mem.c b/deps/openssl/openssl/crypto/bio/bss_mem.c index d190765dc2010e..b0394a960da15f 100644 --- a/deps/openssl/openssl/crypto/bio/bss_mem.c +++ b/deps/openssl/openssl/crypto/bio/bss_mem.c @@ -91,7 +91,8 @@ BIO_METHOD *BIO_s_mem(void) return (&mem_method); } -BIO *BIO_new_mem_buf(void *buf, int len) + +BIO *BIO_new_mem_buf(const void *buf, int len) { BIO *ret; BUF_MEM *b; @@ -105,7 +106,8 @@ BIO *BIO_new_mem_buf(void *buf, int len) if (!(ret = BIO_new(BIO_s_mem()))) return NULL; b = (BUF_MEM *)ret->ptr; - b->data = buf; + /* Cast away const and trust in the MEM_RDONLY flag. 
*/ + b->data = (void *)buf; b->length = sz; b->max = sz; ret->flags |= BIO_FLAGS_MEM_RDONLY; diff --git a/deps/openssl/openssl/crypto/bn/Makefile b/deps/openssl/openssl/crypto/bn/Makefile index 215855ecae914c..c4c6409517596f 100644 --- a/deps/openssl/openssl/crypto/bn/Makefile +++ b/deps/openssl/openssl/crypto/bn/Makefile @@ -252,8 +252,8 @@ bn_exp.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h bn_exp.o: ../../include/openssl/lhash.h ../../include/openssl/opensslconf.h bn_exp.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h bn_exp.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h -bn_exp.o: ../../include/openssl/symhacks.h ../cryptlib.h bn_exp.c bn_lcl.h -bn_exp.o: rsaz_exp.h +bn_exp.o: ../../include/openssl/symhacks.h ../constant_time_locl.h +bn_exp.o: ../cryptlib.h bn_exp.c bn_lcl.h rsaz_exp.h bn_exp2.o: ../../e_os.h ../../include/openssl/bio.h ../../include/openssl/bn.h bn_exp2.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h bn_exp2.o: ../../include/openssl/e_os2.h ../../include/openssl/err.h diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl index 3b6ccf83d13e4b..712a77fe8ca3ab 100755 --- a/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl +++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-avx2.pl @@ -443,7 +443,7 @@ $TEMP3 = $Y1; $TEMP4 = $Y2; $code.=<<___; - #we need to fix indexes 32-39 to avoid overflow + # we need to fix indices 32-39 to avoid overflow vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) @@ -1592,68 +1592,128 @@ .type rsaz_1024_gather5_avx2,\@abi-omnipotent .align 32 rsaz_1024_gather5_avx2: + vzeroupper + mov %rsp,%r11 ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax - vzeroupper .LSEH_begin_rsaz_1024_gather5: # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp - .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax) - .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax) - .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax) - .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax) - .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax) - .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax) - .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax) - .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax) - .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax) - .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax) + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) ___ $code.=<<___; - lea .Lgather_table(%rip),%r11 - mov $power,%eax - and \$3,$power - shr \$2,%eax # cache line number - shl \$4,$power # offset within cache line - - vmovdqu -32(%r11),%ymm7 # .Lgather_permd - vpbroadcastb 8(%r11,%rax), %xmm8 - vpbroadcastb 7(%r11,%rax), %xmm9 - vpbroadcastb 6(%r11,%rax), %xmm10 - vpbroadcastb 
5(%r11,%rax), %xmm11 - vpbroadcastb 4(%r11,%rax), %xmm12 - vpbroadcastb 3(%r11,%rax), %xmm13 - vpbroadcastb 2(%r11,%rax), %xmm14 - vpbroadcastb 1(%r11,%rax), %xmm15 - - lea 64($inp,$power),$inp - mov \$64,%r11 # size optimization - mov \$9,%eax - jmp .Loop_gather_1024 + lea -0x100(%rsp),%rsp + and \$-32, %rsp + lea .Linc(%rip), %r10 + lea -128(%rsp),%rax # control u-op density + + vmovd $power, %xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5, %ymm0, %ymm2 + vpcmpeqd %ymm4, %ymm0, %ymm0 + vpaddd %ymm5, %ymm1, %ymm3 + vpcmpeqd %ymm4, %ymm1, %ymm1 + vmovdqa %ymm0, 32*0+128(%rax) + vpaddd %ymm5, %ymm2, %ymm0 + vpcmpeqd %ymm4, %ymm2, %ymm2 + vmovdqa %ymm1, 32*1+128(%rax) + vpaddd %ymm5, %ymm3, %ymm1 + vpcmpeqd %ymm4, %ymm3, %ymm3 + vmovdqa %ymm2, 32*2+128(%rax) + vpaddd %ymm5, %ymm0, %ymm2 + vpcmpeqd %ymm4, %ymm0, %ymm0 + vmovdqa %ymm3, 32*3+128(%rax) + vpaddd %ymm5, %ymm1, %ymm3 + vpcmpeqd %ymm4, %ymm1, %ymm1 + vmovdqa %ymm0, 32*4+128(%rax) + vpaddd %ymm5, %ymm2, %ymm8 + vpcmpeqd %ymm4, %ymm2, %ymm2 + vmovdqa %ymm1, 32*5+128(%rax) + vpaddd %ymm5, %ymm3, %ymm9 + vpcmpeqd %ymm4, %ymm3, %ymm3 + vmovdqa %ymm2, 32*6+128(%rax) + vpaddd %ymm5, %ymm8, %ymm10 + vpcmpeqd %ymm4, %ymm8, %ymm8 + vmovdqa %ymm3, 32*7+128(%rax) + vpaddd %ymm5, %ymm9, %ymm11 + vpcmpeqd %ymm4, %ymm9, %ymm9 + vpaddd %ymm5, %ymm10, %ymm12 + vpcmpeqd %ymm4, %ymm10, %ymm10 + vpaddd %ymm5, %ymm11, %ymm13 + vpcmpeqd %ymm4, %ymm11, %ymm11 + vpaddd %ymm5, %ymm12, %ymm14 + vpcmpeqd %ymm4, %ymm12, %ymm12 + vpaddd %ymm5, %ymm13, %ymm15 + vpcmpeqd %ymm4, %ymm13, %ymm13 + vpcmpeqd %ymm4, %ymm14, %ymm14 + vpcmpeqd %ymm4, %ymm15, %ymm15 + + vmovdqa -32(%r10),%ymm7 # .Lgather_permd + lea 128($inp), $inp + mov \$9,$power -.align 32 .Loop_gather_1024: - vpand -64($inp), %xmm8,%xmm0 - vpand ($inp), %xmm9,%xmm1 - vpand 64($inp), %xmm10,%xmm2 - vpand ($inp,%r11,2), %xmm11,%xmm3 - vpor %xmm0,%xmm1,%xmm1 - vpand 64($inp,%r11,2), %xmm12,%xmm4 - vpor %xmm2,%xmm3,%xmm3 - vpand ($inp,%r11,4), %xmm13,%xmm5 - vpor %xmm1,%xmm3,%xmm3 - vpand 64($inp,%r11,4), %xmm14,%xmm6 - vpor %xmm4,%xmm5,%xmm5 - vpand -128($inp,%r11,8), %xmm15,%xmm2 - lea ($inp,%r11,8),$inp - vpor %xmm3,%xmm5,%xmm5 - vpor %xmm2,%xmm6,%xmm6 - vpor %xmm5,%xmm6,%xmm6 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqu %ymm6,($out) + vmovdqa 32*0-128($inp), %ymm0 + vmovdqa 32*1-128($inp), %ymm1 + vmovdqa 32*2-128($inp), %ymm2 + vmovdqa 32*3-128($inp), %ymm3 + vpand 32*0+128(%rax), %ymm0, %ymm0 + vpand 32*1+128(%rax), %ymm1, %ymm1 + vpand 32*2+128(%rax), %ymm2, %ymm2 + vpor %ymm0, %ymm1, %ymm4 + vpand 32*3+128(%rax), %ymm3, %ymm3 + vmovdqa 32*4-128($inp), %ymm0 + vmovdqa 32*5-128($inp), %ymm1 + vpor %ymm2, %ymm3, %ymm5 + vmovdqa 32*6-128($inp), %ymm2 + vmovdqa 32*7-128($inp), %ymm3 + vpand 32*4+128(%rax), %ymm0, %ymm0 + vpand 32*5+128(%rax), %ymm1, %ymm1 + vpand 32*6+128(%rax), %ymm2, %ymm2 + vpor %ymm0, %ymm4, %ymm4 + vpand 32*7+128(%rax), %ymm3, %ymm3 + vpand 32*8-128($inp), %ymm8, %ymm0 + vpor %ymm1, %ymm5, %ymm5 + vpand 32*9-128($inp), %ymm9, %ymm1 + vpor %ymm2, %ymm4, %ymm4 + vpand 32*10-128($inp),%ymm10, %ymm2 + vpor %ymm3, %ymm5, %ymm5 + vpand 32*11-128($inp),%ymm11, %ymm3 + vpor %ymm0, %ymm4, %ymm4 + vpand 32*12-128($inp),%ymm12, %ymm0 + vpor %ymm1, %ymm5, %ymm5 + vpand 32*13-128($inp),%ymm13, %ymm1 + vpor %ymm2, %ymm4, %ymm4 + vpand 32*14-128($inp),%ymm14, %ymm2 + vpor %ymm3, %ymm5, %ymm5 + vpand 32*15-128($inp),%ymm15, %ymm3 + lea 32*16($inp), $inp + vpor %ymm0, %ymm4, %ymm4 + vpor %ymm1, %ymm5, %ymm5 + vpor %ymm2, %ymm4, 
%ymm4 + vpor %ymm3, %ymm5, %ymm5 + + vpor %ymm5, %ymm4, %ymm4 + vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared + vpor %xmm4, %xmm5, %xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,($out) lea 32($out),$out - dec %eax + dec $power jnz .Loop_gather_1024 vpxor %ymm0,%ymm0,%ymm0 @@ -1661,20 +1721,20 @@ vzeroupper ___ $code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - movaps 0x20(%rsp),%xmm8 - movaps 0x30(%rsp),%xmm9 - movaps 0x40(%rsp),%xmm10 - movaps 0x50(%rsp),%xmm11 - movaps 0x60(%rsp),%xmm12 - movaps 0x70(%rsp),%xmm13 - movaps 0x80(%rsp),%xmm14 - movaps 0x90(%rsp),%xmm15 - lea 0xa8(%rsp),%rsp + movaps -0xa8(%r11),%xmm6 + movaps -0x98(%r11),%xmm7 + movaps -0x88(%r11),%xmm8 + movaps -0x78(%r11),%xmm9 + movaps -0x68(%r11),%xmm10 + movaps -0x58(%r11),%xmm11 + movaps -0x48(%r11),%xmm12 + movaps -0x38(%r11),%xmm13 + movaps -0x28(%r11),%xmm14 + movaps -0x18(%r11),%xmm15 .LSEH_end_rsaz_1024_gather5: ___ $code.=<<___; + lea (%r11),%rsp ret .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 ___ @@ -1708,8 +1768,10 @@ .long 0,2,4,6,7,7,7,7 .Lgather_permd: .long 0,7,1,7,2,7,3,7 -.Lgather_table: - .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 +.Linc: + .long 0,0,0,0, 1,1,1,1 + .long 2,2,2,2, 3,3,3,3 + .long 4,4,4,4, 4,4,4,4 .align 64 ___ @@ -1837,18 +1899,19 @@ .rva rsaz_se_handler .rva .Lmul_1024_body,.Lmul_1024_epilogue .LSEH_info_rsaz_1024_gather5: - .byte 0x01,0x33,0x16,0x00 - .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15 - .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14 - .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13 - .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12 - .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11 - .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10 - .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9 - .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8 - .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7 - .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6 - .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 + .byte 0x01,0x36,0x17,0x0b + .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 + .byte 0x00,0xb3,0x00,0x00 # set_frame r11 ___ } diff --git a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl index 091cdc2069da4b..87ce2c34d90cbb 100755 --- a/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl +++ b/deps/openssl/openssl/crypto/bn/asm/rsaz-x86_64.pl @@ -915,9 +915,76 @@ push %r14 push %r15 - mov $pwr, $pwr - subq \$128+24, %rsp + subq \$`128+24+($win64?0xb0:0)`, %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,0xa0(%rsp) + movaps %xmm7,0xb0(%rsp) + movaps %xmm8,0xc0(%rsp) + movaps %xmm9,0xd0(%rsp) + movaps %xmm10,0xe0(%rsp) + movaps %xmm11,0xf0(%rsp) + movaps %xmm12,0x100(%rsp) + movaps %xmm13,0x110(%rsp) + movaps %xmm14,0x120(%rsp) + movaps %xmm15,0x130(%rsp) +___ +$code.=<<___; .Lmul_gather4_body: + movd $pwr,%xmm8 + movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 + movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 + + pshufd \$0,%xmm8,%xmm8 # 
broadcast $power + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..15 to $power +# +for($i=0;$i<4;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` + movdqa %xmm7,%xmm`$i+3` +___ +} +for(;$i<7;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` +___ +} +$code.=<<___; + pcmpeqd %xmm8,%xmm7 + + movdqa 16*0($bp),%xmm8 + movdqa 16*1($bp),%xmm9 + movdqa 16*2($bp),%xmm10 + movdqa 16*3($bp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4($bp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5($bp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6($bp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7($bp),%xmm15 + leaq 128($bp), %rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d @@ -926,45 +993,38 @@ je .Lmulx_gather ___ $code.=<<___; - movl 64($bp,$pwr,4), %eax - movq $out, %xmm0 # off-load arguments - movl ($bp,$pwr,4), %ebx - movq $mod, %xmm1 - movq $n0, 128(%rsp) + movq %xmm8,%rbx + + movq $n0, 128(%rsp) # off-load arguments + movq $out, 128+8(%rsp) + movq $mod, 128+16(%rsp) - shlq \$32, %rax - or %rax, %rbx movq ($ap), %rax movq 8($ap), %rcx - leaq 128($bp,$pwr,4), %rbp mulq %rbx # 0 iteration movq %rax, (%rsp) movq %rcx, %rax movq %rdx, %r8 mulq %rbx - movd (%rbp), %xmm4 addq %rax, %r8 movq 16($ap), %rax movq %rdx, %r9 adcq \$0, %r9 mulq %rbx - movd 64(%rbp), %xmm5 addq %rax, %r9 movq 24($ap), %rax movq %rdx, %r10 adcq \$0, %r10 mulq %rbx - pslldq \$4, %xmm5 addq %rax, %r10 movq 32($ap), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx - por %xmm5, %xmm4 addq %rax, %r11 movq 40($ap), %rax movq %rdx, %r12 @@ -977,14 +1037,12 @@ adcq \$0, %r13 mulq %rbx - leaq 128(%rbp), %rbp addq %rax, %r13 movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx - movq %xmm4, %rbx addq %rax, %r14 movq ($ap), %rax movq %rdx, %r15 @@ -996,6 +1054,35 @@ .align 32 .Loop_mul_gather: + movdqa 16*0(%rbp),%xmm8 + movdqa 16*1(%rbp),%xmm9 + movdqa 16*2(%rbp),%xmm10 + movdqa 16*3(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7(%rbp),%xmm15 + leaq 128(%rbp), %rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,%rbx + mulq %rbx addq %rax, %r8 movq 8($ap), %rax @@ -1004,7 +1091,6 @@ adcq \$0, %r8 mulq %rbx - movd (%rbp), %xmm4 addq %rax, %r9 movq 16($ap), %rax adcq \$0, %rdx @@ -1013,7 +1099,6 @@ adcq \$0, %r9 mulq %rbx - movd 64(%rbp), %xmm5 addq %rax, %r10 movq 24($ap), %rax adcq \$0, %rdx @@ -1022,7 +1107,6 @@ adcq \$0, %r10 mulq %rbx - pslldq \$4, %xmm5 addq %rax, %r11 movq 32($ap), %rax adcq \$0, %rdx @@ -1031,7 +1115,6 @@ adcq \$0, %r11 mulq %rbx - por %xmm5, %xmm4 addq %rax, %r12 movq 40($ap), %rax adcq \$0, %rdx @@ -1056,7 +1139,6 @@ adcq \$0, %r14 mulq %rbx - movq %xmm4, %rbx addq %rax, %r15 movq ($ap), %rax adcq \$0, %rdx @@ -1064,7 +1146,6 @@ movq %rdx, %r15 adcq \$0, %r15 - leaq 128(%rbp), %rbp leaq 8(%rdi), %rdi decl %ecx @@ -1079,8 +1160,8 @@ movq %r14, 48(%rdi) movq %r15, 
56(%rdi) - movq %xmm0, $out - movq %xmm1, %rbp + movq 128+8(%rsp), $out + movq 128+16(%rsp), %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 @@ -1098,45 +1179,37 @@ .align 32 .Lmulx_gather: - mov 64($bp,$pwr,4), %eax - movq $out, %xmm0 # off-load arguments - lea 128($bp,$pwr,4), %rbp - mov ($bp,$pwr,4), %edx - movq $mod, %xmm1 - mov $n0, 128(%rsp) + movq %xmm8,%rdx + + mov $n0, 128(%rsp) # off-load arguments + mov $out, 128+8(%rsp) + mov $mod, 128+16(%rsp) - shl \$32, %rax - or %rax, %rdx mulx ($ap), %rbx, %r8 # 0 iteration mov %rbx, (%rsp) xor %edi, %edi # cf=0, of=0 mulx 8($ap), %rax, %r9 - movd (%rbp), %xmm4 mulx 16($ap), %rbx, %r10 - movd 64(%rbp), %xmm5 adcx %rax, %r8 mulx 24($ap), %rax, %r11 - pslldq \$4, %xmm5 adcx %rbx, %r9 mulx 32($ap), %rbx, %r12 - por %xmm5, %xmm4 adcx %rax, %r10 mulx 40($ap), %rax, %r13 adcx %rbx, %r11 mulx 48($ap), %rbx, %r14 - lea 128(%rbp), %rbp adcx %rax, %r12 mulx 56($ap), %rax, %r15 - movq %xmm4, %rdx adcx %rbx, %r13 adcx %rax, %r14 + .byte 0x67 mov %r8, %rbx adcx %rdi, %r15 # %rdi is 0 @@ -1145,24 +1218,48 @@ .align 32 .Loop_mulx_gather: - mulx ($ap), %rax, %r8 + movdqa 16*0(%rbp),%xmm8 + movdqa 16*1(%rbp),%xmm9 + movdqa 16*2(%rbp),%xmm10 + movdqa 16*3(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7(%rbp),%xmm15 + leaq 128(%rbp), %rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,%rdx + + .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 mulx 8($ap), %rax, %r9 - .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4 adcx %rax, %r8 adox %r10, %r9 mulx 16($ap), %rax, %r10 - movd 64(%rbp), %xmm5 - lea 128(%rbp), %rbp adcx %rax, %r9 adox %r11, %r10 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 - pslldq \$4, %xmm5 - por %xmm5, %xmm4 adcx %rax, %r10 adox %r12, %r11 @@ -1176,10 +1273,10 @@ .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 adcx %rax, %r13 + .byte 0x67 adox %r15, %r14 mulx 56($ap), %rax, %r15 - movq %xmm4, %rdx mov %rbx, 64(%rsp,%rcx,8) adcx %rax, %r14 adox %rdi, %r15 @@ -1198,10 +1295,10 @@ mov %r14, 64+48(%rsp) mov %r15, 64+56(%rsp) - movq %xmm0, $out - movq %xmm1, %rbp + mov 128(%rsp), %rdx # pull arguments + mov 128+8(%rsp), $out + mov 128+16(%rsp), %rbp - mov 128(%rsp), %rdx # pull $n0 mov (%rsp), %r8 mov 8(%rsp), %r9 mov 16(%rsp), %r10 @@ -1229,6 +1326,21 @@ call __rsaz_512_subtract leaq 128+24+48(%rsp), %rax +___ +$code.=<<___ if ($win64); + movaps 0xa0-0xc8(%rax),%xmm6 + movaps 0xb0-0xc8(%rax),%xmm7 + movaps 0xc0-0xc8(%rax),%xmm8 + movaps 0xd0-0xc8(%rax),%xmm9 + movaps 0xe0-0xc8(%rax),%xmm10 + movaps 0xf0-0xc8(%rax),%xmm11 + movaps 0x100-0xc8(%rax),%xmm12 + movaps 0x110-0xc8(%rax),%xmm13 + movaps 0x120-0xc8(%rax),%xmm14 + movaps 0x130-0xc8(%rax),%xmm15 + lea 0xb0(%rax),%rax +___ +$code.=<<___; movq -48(%rax), %r15 movq -40(%rax), %r14 movq -32(%rax), %r13 @@ -1258,7 +1370,7 @@ mov $pwr, $pwr subq \$128+24, %rsp .Lmul_scatter4_body: - leaq ($tbl,$pwr,4), $tbl + leaq ($tbl,$pwr,8), $tbl movq $out, %xmm0 # off-load arguments movq $mod, %xmm1 movq $tbl, %xmm2 @@ -1329,30 +1441,14 @@ call __rsaz_512_subtract - movl %r8d, 64*0($inp) # scatter - shrq \$32, %r8 - movl 
%r9d, 64*2($inp) - shrq \$32, %r9 - movl %r10d, 64*4($inp) - shrq \$32, %r10 - movl %r11d, 64*6($inp) - shrq \$32, %r11 - movl %r12d, 64*8($inp) - shrq \$32, %r12 - movl %r13d, 64*10($inp) - shrq \$32, %r13 - movl %r14d, 64*12($inp) - shrq \$32, %r14 - movl %r15d, 64*14($inp) - shrq \$32, %r15 - movl %r8d, 64*1($inp) - movl %r9d, 64*3($inp) - movl %r10d, 64*5($inp) - movl %r11d, 64*7($inp) - movl %r12d, 64*9($inp) - movl %r13d, 64*11($inp) - movl %r14d, 64*13($inp) - movl %r15d, 64*15($inp) + movq %r8, 128*0($inp) # scatter + movq %r9, 128*1($inp) + movq %r10, 128*2($inp) + movq %r11, 128*3($inp) + movq %r12, 128*4($inp) + movq %r13, 128*5($inp) + movq %r14, 128*6($inp) + movq %r15, 128*7($inp) leaq 128+24+48(%rsp), %rax movq -48(%rax), %r15 @@ -1956,16 +2052,14 @@ .type rsaz_512_scatter4,\@abi-omnipotent .align 16 rsaz_512_scatter4: - leaq ($out,$power,4), $out + leaq ($out,$power,8), $out movl \$8, %r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq ($inp), %rax leaq 8($inp), $inp - movl %eax, ($out) - shrq \$32, %rax - movl %eax, 64($out) + movq %rax, ($out) leaq 128($out), $out decl %r9d jnz .Loop_scatter @@ -1976,22 +2070,106 @@ .type rsaz_512_gather4,\@abi-omnipotent .align 16 rsaz_512_gather4: - leaq ($inp,$power,4), $inp +___ +$code.=<<___ if ($win64); +.LSEH_begin_rsaz_512_gather4: + .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp + .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp) + .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp) + .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp) + .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp) + .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp) + .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp) + .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp) + .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp) + .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp) +___ +$code.=<<___; + movd $power,%xmm8 + movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 + movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 + + pshufd \$0,%xmm8,%xmm8 # broadcast $power + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..15 to $power +# +for($i=0;$i<4;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` + movdqa %xmm7,%xmm`$i+3` +___ +} +for(;$i<7;$i++) { +$code.=<<___; + paddd %xmm`$i`,%xmm`$i+1` + pcmpeqd %xmm8,%xmm`$i` +___ +} +$code.=<<___; + pcmpeqd %xmm8,%xmm7 movl \$8, %r9d jmp .Loop_gather .align 16 .Loop_gather: - movl ($inp), %eax - movl 64($inp), %r8d + movdqa 16*0($inp),%xmm8 + movdqa 16*1($inp),%xmm9 + movdqa 16*2($inp),%xmm10 + movdqa 16*3($inp),%xmm11 + pand %xmm0,%xmm8 + movdqa 16*4($inp),%xmm12 + pand %xmm1,%xmm9 + movdqa 16*5($inp),%xmm13 + pand %xmm2,%xmm10 + movdqa 16*6($inp),%xmm14 + pand %xmm3,%xmm11 + movdqa 16*7($inp),%xmm15 leaq 128($inp), $inp - shlq \$32, %r8 - or %r8, %rax - movq %rax, ($out) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd \$0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,($out) leaq 8($out), $out decl %r9d jnz .Loop_gather +___ +$code.=<<___ if ($win64); + movaps 0x00(%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + 
movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + add \$0xa8,%rsp +___ +$code.=<<___; ret +.LSEH_end_rsaz_512_gather4: .size rsaz_512_gather4,.-rsaz_512_gather4 + +.align 64 +.Linc: + .long 0,0, 1,1 + .long 2,2, 2,2 ___ } @@ -2039,6 +2217,18 @@ lea 128+24+48(%rax),%rax + lea .Lmul_gather4_epilogue(%rip),%rbx + cmp %r10,%rbx + jne .Lse_not_in_mul_gather4 + + lea 0xb0(%rax),%rax + + lea -48-0xa8(%rax),%rsi + lea 512($context),%rdi + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lse_not_in_mul_gather4: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -2090,7 +2280,7 @@ pop %rdi pop %rsi ret -.size sqr_handler,.-sqr_handler +.size se_handler,.-se_handler .section .pdata .align 4 @@ -2114,6 +2304,10 @@ .rva .LSEH_end_rsaz_512_mul_by_one .rva .LSEH_info_rsaz_512_mul_by_one + .rva .LSEH_begin_rsaz_512_gather4 + .rva .LSEH_end_rsaz_512_gather4 + .rva .LSEH_info_rsaz_512_gather4 + .section .xdata .align 8 .LSEH_info_rsaz_512_sqr: @@ -2136,6 +2330,19 @@ .byte 9,0,0,0 .rva se_handler .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] +.LSEH_info_rsaz_512_gather4: + .byte 0x01,0x46,0x16,0x00 + .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 + .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 + .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 + .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 + .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 + .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 + .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 + .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 + .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 + .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 + .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8 ___ } diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl index e82e451388c75c..29ba1224e36b5e 100755 --- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl +++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont.pl @@ -775,100 +775,126 @@ # 4096. this is done to allow memory disambiguation logic # do its job. 
# - lea -64(%rsp,$num,4),%r11 + lea -64(%rsp,$num,2),%r11 mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: - lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num - lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num) + lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num + lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rsp .Lsqr8x_sp_done: and \$-64,%rsp - mov $num,%r10 + mov $num,%r10 neg $num - lea 64(%rsp,$num,2),%r11 # copy of modulus mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .Lsqr8x_body: - mov $num,$i - movq %r11, %xmm2 # save pointer to modulus copy - shr \$3+2,$i - mov OPENSSL_ia32cap_P+8(%rip),%eax - jmp .Lsqr8x_copy_n - -.align 32 -.Lsqr8x_copy_n: - movq 8*0($nptr),%xmm0 - movq 8*1($nptr),%xmm1 - movq 8*2($nptr),%xmm3 - movq 8*3($nptr),%xmm4 - lea 8*4($nptr),$nptr - movdqa %xmm0,16*0(%r11) - movdqa %xmm1,16*1(%r11) - movdqa %xmm3,16*2(%r11) - movdqa %xmm4,16*3(%r11) - lea 16*4(%r11),%r11 - dec $i - jnz .Lsqr8x_copy_n - + movq $nptr, %xmm2 # save pointer to modulus pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq %r10, %xmm3 # -$num ___ $code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%eax and \$0x80100,%eax cmp \$0x80100,%eax jne .Lsqr8x_nox call bn_sqrx8x_internal # see x86_64-mont5 module - - pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - lea 64(%rsp,$num,2),%rdx - shr \$3+2,$num - mov 40(%rsp),%rsi # restore %rsp - jmp .Lsqr8x_zero + # %rax top-most carry + # %rbp nptr + # %rcx -8*num + # %r8 end of tp[2*num] + lea (%r8,%rcx),%rbx + mov %rcx,$num + mov %rcx,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub .align 32 .Lsqr8x_nox: ___ $code.=<<___; call bn_sqr8x_internal # see x86_64-mont5 module + # %rax top-most carry + # %rbp nptr + # %r8 -8*num + # %rdi end of tp[2*num] + lea (%rdi,$num),%rbx + mov $num,%rcx + mov $num,%rdx + movq %xmm1,$rptr + sar \$3+2,%rcx # %cf=0 + jmp .Lsqr8x_sub +.align 32 +.Lsqr8x_sub: + mov 8*0(%rbx),%r12 + mov 8*1(%rbx),%r13 + mov 8*2(%rbx),%r14 + mov 8*3(%rbx),%r15 + lea 8*4(%rbx),%rbx + sbb 8*0(%rbp),%r12 + sbb 8*1(%rbp),%r13 + sbb 8*2(%rbp),%r14 + sbb 8*3(%rbp),%r15 + lea 8*4(%rbp),%rbp + mov %r12,8*0($rptr) + mov %r13,8*1($rptr) + mov %r14,8*2($rptr) + mov %r15,8*3($rptr) + lea 8*4($rptr),$rptr + inc %rcx # preserves %cf + jnz .Lsqr8x_sub + + sbb \$0,%rax # top-most carry + lea (%rbx,$num),%rbx # rewind + lea ($rptr,$num),$rptr # rewind + + movq %rax,%xmm1 pxor %xmm0,%xmm0 - lea 48(%rsp),%rax - lea 64(%rsp,$num,2),%rdx - shr \$3+2,$num + pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp - jmp .Lsqr8x_zero + jmp .Lsqr8x_cond_copy .align 32 -.Lsqr8x_zero: - movdqa %xmm0,16*0(%rax) # wipe t - movdqa %xmm0,16*1(%rax) - movdqa %xmm0,16*2(%rax) - movdqa %xmm0,16*3(%rax) - lea 16*4(%rax),%rax - movdqa %xmm0,16*0(%rdx) # wipe n - movdqa %xmm0,16*1(%rdx) - movdqa %xmm0,16*2(%rdx) - movdqa %xmm0,16*3(%rdx) - lea 16*4(%rdx),%rdx - dec $num - jnz .Lsqr8x_zero +.Lsqr8x_cond_copy: + movdqa 16*0(%rbx),%xmm2 + movdqa 16*1(%rbx),%xmm3 + lea 16*2(%rbx),%rbx + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2(%rbx) # zero tp + movdqa %xmm0,-16*1(%rbx) + movdqa %xmm0,-16*2(%rbx,%rdx) + movdqa %xmm0,-16*1(%rbx,%rdx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 
+ por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + add \$32,$num + jnz .Lsqr8x_cond_copy mov \$1,%rax mov -48(%rsi),%r15 @@ -1135,64 +1161,75 @@ adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 - mov -8($nptr),$mi sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) cmp 16(%rsp),$bptr jne .Lmulx4x_outer - sub %r14,$mi # compare top-most words - sbb $mi,$mi - or $mi,%r15 - - neg $num - xor %rdx,%rdx + lea 64(%rsp),$tptr + sub $num,$nptr # rewind $nptr + neg %r15 + mov $num,%rdx + shr \$3+2,$num # %cf=0 mov 32(%rsp),$rptr # restore rp + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + mov 8*0($tptr),%r11 + mov 8*1($tptr),%r12 + mov 8*2($tptr),%r13 + mov 8*3($tptr),%r14 + lea 8*4($tptr),$tptr + sbb 8*0($nptr),%r11 + sbb 8*1($nptr),%r12 + sbb 8*2($nptr),%r13 + sbb 8*3($nptr),%r14 + lea 8*4($nptr),$nptr + mov %r11,8*0($rptr) + mov %r12,8*1($rptr) + mov %r13,8*2($rptr) + mov %r14,8*3($rptr) + lea 8*4($rptr),$rptr + dec $num # preserves %cf + jnz .Lmulx4x_sub + + sbb \$0,%r15 # top-most carry lea 64(%rsp),$tptr + sub %rdx,$rptr # rewind + movq %r15,%xmm1 pxor %xmm0,%xmm0 - mov 0*8($nptr,$num),%r8 - mov 1*8($nptr,$num),%r9 - neg %r8 - jmp .Lmulx4x_sub_entry + pshufd \$0,%xmm1,%xmm1 + mov 40(%rsp),%rsi # restore %rsp + jmp .Lmulx4x_cond_copy .align 32 -.Lmulx4x_sub: - mov 0*8($nptr,$num),%r8 - mov 1*8($nptr,$num),%r9 - not %r8 -.Lmulx4x_sub_entry: - mov 2*8($nptr,$num),%r10 - not %r9 - and %r15,%r8 - mov 3*8($nptr,$num),%r11 - not %r10 - and %r15,%r9 - not %r11 - and %r15,%r10 - and %r15,%r11 - - neg %rdx # mov %rdx,%cf - adc 0*8($tptr),%r8 - adc 1*8($tptr),%r9 - movdqa %xmm0,($tptr) - adc 2*8($tptr),%r10 - adc 3*8($tptr),%r11 - movdqa %xmm0,16($tptr) - lea 4*8($tptr),$tptr - sbb %rdx,%rdx # mov %cf,%rdx +.Lmulx4x_cond_copy: + movdqa 16*0($tptr),%xmm2 + movdqa 16*1($tptr),%xmm3 + lea 16*2($tptr),$tptr + movdqu 16*0($rptr),%xmm4 + movdqu 16*1($rptr),%xmm5 + lea 16*2($rptr),$rptr + movdqa %xmm0,-16*2($tptr) # zero tp + movdqa %xmm0,-16*1($tptr) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-16*2($rptr) + movdqu %xmm5,-16*1($rptr) + sub \$32,%rdx + jnz .Lmulx4x_cond_copy - mov %r8,0*8($rptr) - mov %r9,1*8($rptr) - mov %r10,2*8($rptr) - mov %r11,3*8($rptr) - lea 4*8($rptr),$rptr + mov %rdx,($tptr) - add \$32,$num - jnz .Lmulx4x_sub - - mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax mov -48(%rsi),%r15 mov -40(%rsi),%r14 diff --git a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl index 292409c4ffb8b1..2e8c9db32cbc13 100755 --- a/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl +++ b/deps/openssl/openssl/crypto/bn/asm/x86_64-mont5.pl @@ -99,58 +99,111 @@ .Lmul_enter: mov ${num}d,${num}d mov %rsp,%rax - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument + lea .Linc(%rip),%r10 push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; + lea 2($num),%r11 neg %r11 - lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) + lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8) and \$-1024,%rsp # minimize TLB usage mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .Lmul_body: - mov $bp,%r12 # reassign $bp + lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 
5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 + lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) + and \$-16,%r10 + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + .byte 0x67 + movdqa %xmm4,%xmm3 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($k+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($k+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($k+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($k+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($k+0)+112`(%r10) + + paddd %xmm2,%xmm3 + .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($k+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($k+2)+112`(%r10) + pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($k+1)-128`($bp),%xmm1 + pand `16*($k+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($k+3)+112`(%r10) + pand `16*($k+3)-128`($bp),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm4 + movdqa `16*($k+1)-128`($bp),%xmm5 + movdqa `16*($k+2)-128`($bp),%xmm2 + pand `16*($k+0)+112`(%r10),%xmm4 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($k+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($k+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - movq %xmm0,$m0 # m0=bp[0] mov ($n0),$n0 # pull n0[0] value @@ -159,29 +212,14 @@ xor $i,$i # i=0 xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -212,16 +250,14 @@ mulq $m1 # np[j]*m1 cmp $num,$j - jne .L1st - - movq %xmm0,$m0 # 
bp[1] + jne .L1st # note that upon exit $j==$num, so + # they can be used interchangeably add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 mov $lo0,$hi0 @@ -235,33 +271,48 @@ jmp .Louter .align 16 .Louter: + lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) + and \$-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($k=0;$k<$STRIDE/16;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm0 + movdqa `16*($k+1)-128`($bp),%xmm1 + movdqa `16*($k+2)-128`($bp),%xmm2 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+0)-128`(%rdx),%xmm0 + pand `16*($k+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($k+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($k+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + + mov ($ap),%rax # ap[0] + movq %xmm0,$m0 # m0=bp[i] + xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -295,17 +346,14 @@ mulq $m1 # np[j]*m1 cmp $num,$j - jne .Linner - - movq %xmm0,$m0 # bp[i+1] - + jne .Linner # note that upon exit $j==$num, so + # they can be used interchangeably add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] - mov (%rsp,$j,8),$lo0 + mov (%rsp,$num,8),$lo0 adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 xor %rdx,%rdx @@ -352,12 +400,7 @@ mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -379,8 +422,8 @@ .Lmul4x_enter: ___ $code.=<<___ if ($addx); - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lmulx4x_enter ___ $code.=<<___; @@ -392,39 +435,34 @@ push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; + .byte 0x67 - mov ${num}d,%r10d - shl \$3,${num}d - shl \$3+2,%r10d # 4*$num + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. 
Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $ap - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + sub %r11,%rsp # align with $rp + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - lea 4096-64(,$num,2),%r10 - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -440,12 +478,7 @@ mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -460,9 +493,10 @@ .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: - shl \$5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument - lea 256(%rdx,$num),%r13 + shl \$5,$num # $num was in bytes + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index + lea .Linc(%rip),%rax + lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) shr \$5,$num # restore $num ___ $bp="%r12"; @@ -470,44 +504,92 @@ $N=$STRIDE/4; # should match cache line size $tp=$i; $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - lea $STRIDE($bp),$tp # borrow $tp - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - .byte 0x67 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tp),%xmm1 - .byte 0x67 - pand %xmm7,%xmm3 - .byte 0x67 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tp),%xmm2 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128(%rdx),$bp # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67,0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq 
`2*$STRIDE/4-96`($tp),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($i+1)-128`($bp),%xmm1 + pand `16*($i+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm4 + movdqa `16*($i+1)-128`($bp),%xmm5 + movdqa `16*($i+2)-128`($bp),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] - movq `3*$STRIDE/4-96`($tp),%xmm0 + mov %r13,16+8(%rsp) # save end of b[num] mov $rp, 56+8(%rsp) # save $rp @@ -521,26 +603,10 @@ mov %rax,$A[0] mov ($np),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq $A[0],$m1 # "tp[0]"*n0 - ############################################################## - # $tp is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8(%rsp,%r11,8),$tp + lea 64+8(%rsp),$tp mov %rdx,$A[1] - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - lea 2*$STRIDE($bp),$bp - por %xmm1,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap,$num),%rax @@ -549,7 +615,7 @@ mulq $m0 add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -559,7 +625,7 @@ adc \$0,%rdx add $A[1],$N[1] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) mov %rdx,$N[0] @@ -569,7 +635,7 @@ .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -585,7 +651,7 @@ mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -600,7 +666,7 @@ mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] @@ -615,7 +681,7 @@ mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -624,7 +690,7 @@ mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] @@ -634,7 +700,7 @@ mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -650,7 +716,7 @@ mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -663,8 +729,7 @@ mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -675,6 +740,33 @@ .align 32 .Louter4x: + lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm0 + movdqa `16*($i+1)-128`($bp),%xmm1 + movdqa `16*($i+2)-128`($bp),%xmm2 + movdqa 
`16*($i+3)-128`($bp),%xmm3 + pand `16*($i+0)-128`(%rdx),%xmm0 + pand `16*($i+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] @@ -682,25 +774,11 @@ mov ($np),%rax adc \$0,%rdx - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - imulq $A[0],$m1 # tp[0]*n0 - .byte 0x67 mov %rdx,$A[1] mov $N[1],($tp) # store upmost overflow bit - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 lea ($tp,$num),$tp # rewind $tp - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded @@ -710,7 +788,7 @@ mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] # +tp[1] adc \$0,%rdx @@ -722,7 +800,7 @@ adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x @@ -731,7 +809,7 @@ .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -749,7 +827,7 @@ mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -766,7 +844,7 @@ mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx @@ -783,7 +861,7 @@ mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] adc \$0,%rdx @@ -794,7 +872,7 @@ mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] @@ -804,7 +882,7 @@ mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -823,7 +901,7 @@ mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov $m1,%rax - mov -16*1($np),$m1 + mov -8*1($np),$m1 adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -838,9 +916,8 @@ mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[i+1] mov $N[1],-16($tp) # tp[j-1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -854,16 +931,23 @@ ___ if (1) { $code.=<<___; + xor %rax,%rax sub $N[0],$m1 # compare top-most words adc $j,$j # $j is zero or $j,$N[1] - xor \$1,$N[1] + sub $N[1],%rax # %rax=-$N[1] lea ($tp,$num),%rbx # tptr in .sqr4x_sub - lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub + mov ($np),%r12 + lea ($np),%rbp # nptr in .sqr4x_sub mov %r9,%rcx - sar \$3+2,%rcx # cf=0 + sar \$3+2,%rcx mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub - jmp .Lsqr4x_sub + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1(%rbp),%r13 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqr4x_sub_entry ___ } else { my @ri=("%rax",$bp,$m0,$m1); @@ -930,8 +1014,8 @@ ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lpowerx5_enter ___ $code.=<<___; @@ -942,38 
+1026,32 @@ push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10d # 3*$num neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -995,16 +1073,21 @@ mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .Lpower5_body: - movq $rptr,%xmm1 # save $rptr + movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr - movq %r10, %xmm3 # -$num + movq %r10, %xmm3 # -$num, used in sqr8x movq $bptr,%xmm4 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal movq %xmm2,$nptr movq %xmm4,$bptr @@ -1565,9 +1648,9 @@ $code.=<<___; movq %xmm2,$nptr -sqr8x_reduction: +__bn_sqr8x_reduction: xor %rax,%rax - lea ($nptr,$num,2),%rcx # end of n[] + lea ($nptr,$num),%rcx # end of n[] lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer mov %rcx,0+8(%rsp) lea 48+8(%rsp,$num),$tptr # end of initial t[] window @@ -1593,21 +1676,21 @@ .byte 0x67 mov $m0,%r8 imulq 32+8(%rsp),$m0 # n0*a[0] - mov 16*0($nptr),%rax # n[0] + mov 8*0($nptr),%rax # n[0] mov \$8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq $m0 - mov 16*1($nptr),%rax # n[1] + mov 8*1($nptr),%rax # n[1] neg %r8 mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] @@ -1616,7 +1699,7 @@ mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov 32+8(%rsp),$carry # pull n0, borrow $carry @@ -1625,7 +1708,7 @@ mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx imulq %r8,$carry # modulo-scheduled add %r11,%r10 @@ -1634,7 +1717,7 @@ mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1642,7 +1725,7 @@ mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1650,7 +1733,7 @@ mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1659,7 +1742,7 @@ mulq $m0 mov $carry,$m0 # n0*a[i] add %rax,%r15 - mov 16*0($nptr),%rax 
# n[0] + mov 8*0($nptr),%rax # n[0] adc \$0,%rdx add %r15,%r14 mov %rdx,%r15 @@ -1668,7 +1751,7 @@ dec %ecx jnz .L8x_reduce - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr xor %rax,%rax mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? @@ -1687,21 +1770,21 @@ mov 48+56+8(%rsp),$m0 # pull n0*a[0] mov \$8,%ecx - mov 16*0($nptr),%rax + mov 8*0($nptr),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq $m0 add %rax,%r8 - mov 16*1($nptr),%rax + mov 8*1($nptr),%rax mov %r8,($tptr) # save result mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 lea 8($tptr),$tptr # $tptr++ @@ -1710,7 +1793,7 @@ mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov %rdx,%r10 @@ -1718,7 +1801,7 @@ mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx add %r11,%r10 mov %rdx,%r11 @@ -1726,7 +1809,7 @@ mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1734,7 +1817,7 @@ mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1742,7 +1825,7 @@ mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1753,14 +1836,14 @@ add %rax,%r15 adc \$0,%rdx add %r15,%r14 - mov 16*0($nptr),%rax # pull n[0] + mov 8*0($nptr),%rax # pull n[0] mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_tail - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_tail_done # break out of loop @@ -1806,7 +1889,7 @@ adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry - mov -16($nptr),%rcx # np[num-1] + mov -8($nptr),%rcx # np[num-1] xor $carry,$carry movq %xmm2,$nptr # restore $nptr @@ -1824,6 +1907,8 @@ cmp %rdx,$tptr # end of t[]? 
jb .L8x_reduction_loop + ret +.size bn_sqr8x_internal,.-bn_sqr8x_internal ___ } ############################################################## @@ -1832,48 +1917,62 @@ { my ($tptr,$nptr)=("%rbx","%rbp"); $code.=<<___; - #xor %rsi,%rsi # %rsi was $carry above - sub %r15,%rcx # compare top-most words +.type __bn_post4x_internal,\@abi-omnipotent +.align 32 +__bn_post4x_internal: + mov 8*0($nptr),%r12 lea (%rdi,$num),$tptr # %rdi was $tptr above - adc %rsi,%rsi mov $num,%rcx - or %rsi,%rax movq %xmm1,$rptr # restore $rptr - xor \$1,%rax + neg %rax movq %xmm1,$aptr # prepare for back-to-back call - lea ($nptr,%rax,8),$nptr - sar \$3+2,%rcx # cf=0 - jmp .Lsqr4x_sub + sar \$3+2,%rcx + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqr4x_sub_entry -.align 32 +.align 16 .Lsqr4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqr4x_sub_entry: + lea 8*4($nptr),$nptr + not %r12 + not %r13 + not %r14 + not %r15 + and %rax,%r12 + and %rax,%r13 + and %rax,%r14 + and %rax,%r15 + + neg %r10 # mov %r10,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r10,%r10 # mov %cf,%r10 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # pass %cf jnz .Lsqr4x_sub -___ -} -$code.=<<___; + mov $num,%r10 # prepare for back-to-back call neg $num # restore $num ret -.size bn_sqr8x_internal,.-bn_sqr8x_internal +.size __bn_post4x_internal,.-__bn_post4x_internal ___ +} { $code.=<<___; .globl bn_from_montgomery @@ -1897,39 +1996,32 @@ push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). The stack is allocated to aligned with + # bn_power5's frame, and as bn_from_montgomery happens to be + # last operation, we use the opportunity to cleanse it. 
# - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -1983,12 +2075,13 @@ ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 jne .Lfrom_mont_nox lea (%rax,$num),$rptr - call sqrx8x_reduction + call __bn_sqrx8x_reduction + call __bn_postx4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax @@ -1999,7 +2092,8 @@ .Lfrom_mont_nox: ___ $code.=<<___; - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax @@ -2039,7 +2133,6 @@ .align 32 bn_mulx4x_mont_gather5: .Lmulx4x_enter: - .byte 0x67 mov %rsp,%rax push %rbx push %rbp @@ -2047,40 +2140,33 @@ push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers a[num], ret[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] 
# - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done -.align 32 .Lmulx4xsp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -2106,12 +2192,7 @@ mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -2126,14 +2207,16 @@ .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: - .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num - .byte 0x67 + mov $num,8(%rsp) # save -$num (it was in bytes) + mov $num,%r10 neg $num # restore $num shl \$5,$num - lea 256($bp,$num),%r13 + neg %r10 # restore $num + lea 128($bp,$num),%r13 # end of powers table (+size optimization) shr \$5+5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument sub \$1,$num + lea .Linc(%rip),%rax mov %r13,16+8(%rsp) # end of b[num] mov $num,24+8(%rsp) # inner counter mov $rp, 56+8(%rsp) # save $rp @@ -2144,52 +2227,92 @@ my $STRIDE=2**5*8; # 5 is "window size" my $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bptr # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bptr),%xmm0 - lea $STRIDE($bptr),$tptr # borrow $tptr - movq `1*$STRIDE/4-96`($bptr),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tptr),%xmm1 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tptr),%xmm2 - por %xmm3,%xmm0 - .byte 0x67,0x67 - pand %xmm4,%xmm1 - movq `2*$STRIDE/4-96`($tptr),%xmm3 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) + lea 128($bp),$bptr # size optimization + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + .byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + 
movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + .byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + + pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register + pand `16*($i+1)-128`($bptr),%xmm1 + pand `16*($i+2)-128`($bptr),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bptr),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm4 + movdqa `16*($i+1)-128`($bptr),%xmm5 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + pxor %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # bp[0] - movq `3*$STRIDE/4-96`($tptr),%xmm0 - lea 2*$STRIDE($bptr),$bptr # next &b[i] - pand %xmm5,%xmm2 - .byte 0x67,0x67 - pand %xmm6,%xmm3 - ############################################################## - # $tptr is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8*4+8(%rsp,%r11,8),$tptr + lea 64+8*4+8(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] @@ -2205,37 +2328,31 @@ xor $zero,$zero # cf=0, of=0 mov $mi,%rdx - por %xmm2,%xmm1 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 mov $bptr,8+8(%rsp) # off-load &b[i] - por %xmm1,%xmm0 - .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr + lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 mov 24+8(%rsp),$bptr # counter value - .byte 0x66 mov %r10,-8*4($tptr) adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 - .byte 0x67,0x67 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-8*3($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 - .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-8*2($tptr) - #jmp .Lmulx4x_1st + jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: @@ -2255,30 +2372,29 @@ lea 4*8($tptr),$tptr adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[1] adc $zero,%r15 # modulo-scheduled lea ($aptr,$num),$aptr # rewind $aptr add %r15,%r14 @@ -2289,6 +2405,34 @@ .align 32 .Lmulx4x_outer: + lea 16-256($tptr),%r10 # where 256-byte mask is (+density 
control) + pxor %xmm4,%xmm4 + .byte 0x67,0x67 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm0 + movdqa `16*($i+1)-128`($bptr),%xmm1 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+256`(%r10),%xmm0 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+256`(%r10),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)+256`(%r10),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)+256`(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # m0=bp[i] + mov $zero,($tptr) # save top-most carry lea 4*8($tptr,$num),$tptr # rewind $tptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] @@ -2303,54 +2447,37 @@ mulx 3*8($aptr),%rdx,%r14 adox -2*8($tptr),%r12 adcx %rdx,%r13 - lea ($nptr,$num,2),$nptr # rewind $nptr + lea ($nptr,$num),$nptr # rewind $nptr lea 4*8($aptr),$aptr adox -1*8($tptr),%r13 adcx $zero,%r14 adox $zero,%r14 - .byte 0x67 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 - movq `0*$STRIDE/4-96`($bptr),%xmm0 - .byte 0x67,0x67 mov $mi,%rdx - movq `1*$STRIDE/4-96`($bptr),%xmm1 - .byte 0x67 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - .byte 0x67 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - add \$$STRIDE,$bptr # next &b[i] - .byte 0x67 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 xor $zero,$zero # cf=0, of=0 mov $bptr,8+8(%rsp) # off-load &b[i] - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - por %xmm2,%xmm0 mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) - por %xmm3,%xmm0 adcx %rax,%r12 mov %r11,-8*3($tptr) adox $zero,%r15 # of=0 mov %r12,-8*2($tptr) - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr jmp .Lmulx4x_inner .align 32 @@ -2375,20 +2502,20 @@ adcx $zero,%r14 # cf=0 adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mov %r11,-4*8($tptr) - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 @@ -2398,7 +2525,6 @@ jnz .Lmulx4x_inner mov 0+8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[i+1] adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$bptr # pull top-most carry to %cf mov 8+8(%rsp),$bptr # re-load &b[i] @@ -2411,20 +2537,26 @@ cmp %r10,$bptr jb .Lmulx4x_outer - mov -16($nptr),%r10 + mov -8($nptr),%r10 + mov $zero,%r8 + mov ($nptr,$num),%r12 + lea ($nptr,$num),%rbp # rewind $nptr + mov $num,%rcx + lea ($tptr,$num),%rdi # rewind $tptr + xor %eax,%eax xor %r15,%r15 sub %r14,%r10 # compare top-most words adc %r15,%r15 - or %r15,$zero - xor \$1,$zero - lea ($tptr,$num),%rdi # rewind $tptr - lea ($nptr,$num,2),$nptr # rewind $nptr - .byte 0x67,0x67 - sar \$3+2,$num # cf=0 - lea ($nptr,$zero,8),%rbp + or %r15,%r8 + sar \$3+2,%rcx + sub %r8,%rax # %rax=-%r8 mov 56+8(%rsp),%rdx # restore rp - mov $num,%rcx - jmp .Lsqrx4x_sub # common post-condition + dec %r12 # so that after 'not' we get -n[0] + mov 8*1(%rbp),%r13 + xor %r8,%r8 + mov 
8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqrx4x_sub_entry # common post-condition .size mulx4x_internal,.-mulx4x_internal ___ } { @@ -2448,7 +2580,6 @@ .align 32 bn_powerx5: .Lpowerx5_enter: - .byte 0x67 mov %rsp,%rax push %rbx push %rbp @@ -2456,39 +2587,32 @@ push %r13 push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 @@ -2519,10 +2643,15 @@ .Lpowerx5_body: call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal mov %r10,$num # -num mov $aptr,$rptr @@ -2534,12 +2663,7 @@ mov 40(%rsp),%rsi # restore %rsp mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 mov -40(%rsi),%r14 mov -32(%rsi),%r13 @@ -2973,11 +3097,11 @@ $code.=<<___; movq %xmm2,$nptr -sqrx8x_reduction: +__bn_sqrx8x_reduction: xor %eax,%eax # initial top-most carry bit mov 32+8(%rsp),%rbx # n0 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) - lea -128($nptr,$num,2),%rcx # end of n[] + lea -8*8($nptr,$num),%rcx # end of n[] #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer mov %rcx, 0+8(%rsp) # save end of n[] mov $tptr,8+8(%rsp) # save end of t[] @@ -3006,23 +3130,23 @@ .align 32 .Lsqrx8x_reduce: mov %r8, %rbx - mulx 16*0($nptr),%rax,%r8 # n[0] + mulx 8*0($nptr),%rax,%r8 # n[0] adcx %rbx,%rax # discarded adox %r9,%r8 - mulx 16*1($nptr),%rbx,%r9 # n[1] + mulx 8*1($nptr),%rbx,%r9 # n[1] adcx %rbx,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rbx,%r10 + mulx 8*2($nptr),%rbx,%r10 adcx %rbx,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rbx,%r11 + mulx 8*3($nptr),%rbx,%r11 adcx %rbx,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12 + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 mov %rdx,%rax mov %r8,%rdx adcx %rbx,%r11 @@ -3032,15 +3156,15 @@ mov %rax,%rdx mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 
16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov %rbx,%rdx adcx %rax,%r14 adox $carry,%r15 # $carry is 0 @@ -3056,7 +3180,7 @@ mov 48+8(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov \$-8,%rcx adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 @@ -3075,35 +3199,35 @@ .align 32 .Lsqrx8x_tail: mov %r8,%rbx - mulx 16*0($nptr),%rax,%r8 + mulx 8*0($nptr),%rax,%r8 adcx %rax,%rbx adox %r9,%r8 - mulx 16*1($nptr),%rax,%r9 + mulx 8*1($nptr),%rax,%r9 adcx %rax,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rax,%r10 + mulx 8*2($nptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rax,%r11 + mulx 8*3($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12 + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 adox $carry,%r15 @@ -3119,7 +3243,7 @@ sub 16+8(%rsp),$carry # mov 16(%rsp),%cf mov 48+8(%rsp),%rdx # pull n0*a[0] - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 @@ -3155,7 +3279,7 @@ adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 - mov 16*7($nptr),$carry + mov 8*7($nptr),$carry movq %xmm2,$nptr # restore $nptr adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 @@ -3181,6 +3305,8 @@ lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8+8(%rsp),%r8 # end of t[]? 
jb .Lsqrx8x_reduction_loop + ret +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal ___ } ############################################################## @@ -3188,52 +3314,59 @@ # { my ($rptr,$nptr)=("%rdx","%rbp"); -my @ri=map("%r$_",(10..13)); -my @ni=map("%r$_",(14..15)); $code.=<<___; - xor %ebx,%ebx - sub %r15,%rsi # compare top-most words - adc %rbx,%rbx +.align 32 +__bn_postx4x_internal: + mov 8*0($nptr),%r12 mov %rcx,%r10 # -$num - or %rbx,%rax mov %rcx,%r9 # -$num - xor \$1,%rax - sar \$3+2,%rcx # cf=0 + neg %rax + sar \$3+2,%rcx #lea 48+8(%rsp,%r9),$tptr - lea ($nptr,%rax,8),$nptr movq %xmm1,$rptr # restore $rptr movq %xmm1,$aptr # prepare for back-to-back call - jmp .Lsqrx4x_sub + dec %r12 # so that after 'not' we get -n[0] + mov 8*1($nptr),%r13 + xor %r8,%r8 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqrx4x_sub_entry -.align 32 +.align 16 .Lsqrx4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqrx4x_sub_entry: + andn %rax,%r12,%r12 + lea 8*4($nptr),$nptr + andn %rax,%r13,%r13 + andn %rax,%r14,%r14 + andn %rax,%r15,%r15 + + neg %r8 # mov %r8,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r8,%r8 # mov %cf,%r8 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx jnz .Lsqrx4x_sub -___ -} -$code.=<<___; + neg %r9 # restore $num ret -.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.size __bn_postx4x_internal,.-__bn_postx4x_internal ___ +} }}} { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order @@ -3282,56 +3415,91 @@ .globl bn_gather5 .type bn_gather5,\@abi-omnipotent -.align 16 +.align 32 bn_gather5: -___ -$code.=<<___ if ($win64); -.LSEH_begin_bn_gather5: +.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) - .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) + .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp + lea .Linc(%rip),%rax + and \$-16,%rsp # shouldn't be formally required + + movd $idx,%xmm5 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 128($tbl),%r11 # size optimization + lea 128(%rsp),%rax # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast $idx + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 ___ +######################################################################## +# calculate mask by comparing 0..31 to $idx and save result to stack +# +for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; - mov $idx,%r11d - shr \$`log($N/8)/log(2)`,$idx - and \$`$N/8-1`,%r11 - not $idx - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,$idx # 5 is "window size" - lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line - movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which - movq 8(%rax,$idx,8),%xmm5 # cache line contains element - movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument - movq 24(%rax,$idx,8),%xmm7 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 +___ +$code.=<<___ if ($i); + movdqa %xmm3,`16*($i-1)-128`(%rax) +___ 
+$code.=<<___; + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)-128`(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)-128`(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)-128`(%rax) + movdqa %xmm4,%xmm2 +___ +} +$code.=<<___; + movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather -.align 16 -.Lgather: - movq `0*$STRIDE/4-128`($tbl),%xmm0 - movq `1*$STRIDE/4-128`($tbl),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-128`($tbl),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-128`($tbl),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - .byte 0x67,0x67 - por %xmm2,%xmm0 - lea $STRIDE($tbl),$tbl - por %xmm3,%xmm0 +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`(%r11),%xmm0 + movdqa `16*($i+1)-128`(%r11),%xmm1 + movdqa `16*($i+2)-128`(%r11),%xmm2 + pand `16*($i+0)-128`(%rax),%xmm0 + movdqa `16*($i+3)-128`(%r11),%xmm3 + pand `16*($i+1)-128`(%rax),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rax),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + lea $STRIDE(%r11),%r11 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - lea 0x28(%rsp),%rsp -___ -$code.=<<___; + + lea (%r10),%rsp ret .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 @@ -3339,9 +3507,9 @@ } $code.=<<___; .align 64 -.Lmagic_masks: - .long 0,0, 0,0, 0,0, -1,-1 - .long 0,0, 0,0, 0,0, 0,0 +.Linc: + .long 0,0, 1,1 + .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " ___ @@ -3389,19 +3557,16 @@ lea .Lmul_epilogue(%rip),%r10 cmp %r10,%rbx - jb .Lbody_40 + ja .Lbody_40 mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer + jmp .Lbody_proceed .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer .Lbody_proceed: - - movaps -88(%rax),%xmm0 - movaps -72(%rax),%xmm1 - mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3414,8 +3579,6 @@ mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 - movups %xmm0,512($context) # restore context->Xmm6 - movups %xmm1,528($context) # restore context->Xmm7 .Lcommon_seh_tail: mov 8(%rax),%rdi @@ -3526,10 +3689,9 @@ $code.=<<___; .align 8 .LSEH_info_bn_gather5: - .byte 0x01,0x0d,0x05,0x00 - .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 - .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 + .byte 0x01,0x0b,0x03,0x0a + .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 + .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) .align 8 ___ } diff --git a/deps/openssl/openssl/crypto/bn/bn.h b/deps/openssl/openssl/crypto/bn/bn.h index 5696965e9a09d0..86264ae6315fb1 100644 --- a/deps/openssl/openssl/crypto/bn/bn.h +++ b/deps/openssl/openssl/crypto/bn/bn.h @@ -125,6 +125,7 @@ #ifndef HEADER_BN_H # define HEADER_BN_H +# include # include # ifndef OPENSSL_NO_FP_API # include /* FILE */ @@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void); /* library internal functions */ -# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\ - (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2)) +# define 
bn_expand(a,bits) \ + ( \ + bits > (INT_MAX - BN_BITS2 + 1) ? \ + NULL \ + : \ + (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? \ + (a) \ + : \ + bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2) \ + ) + # define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words))) BIGNUM *bn_expand2(BIGNUM *a, int words); # ifndef OPENSSL_NO_DEPRECATED diff --git a/deps/openssl/openssl/crypto/bn/bn_exp.c b/deps/openssl/openssl/crypto/bn/bn_exp.c index 6d30d1e0fff56e..1670f01d1d8c44 100644 --- a/deps/openssl/openssl/crypto/bn/bn_exp.c +++ b/deps/openssl/openssl/crypto/bn/bn_exp.c @@ -110,6 +110,7 @@ */ #include "cryptlib.h" +#include "constant_time_locl.h" #include "bn_lcl.h" #include @@ -606,15 +607,17 @@ static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos) static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, - int width) + int window) { - size_t i, j; + int i, j; + int width = 1 << window; + BN_ULONG *table = (BN_ULONG *)buf; if (top > b->top) top = b->top; /* this works because 'buf' is explicitly * zeroed */ - for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) { - buf[j] = ((unsigned char *)b->d)[i]; + for (i = 0, j = idx; i < top; i++, j += width) { + table[j] = b->d[i]; } return 1; @@ -622,15 +625,51 @@ static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, - int width) + int window) { - size_t i, j; + int i, j; + int width = 1 << window; + volatile BN_ULONG *table = (volatile BN_ULONG *)buf; if (bn_wexpand(b, top) == NULL) return 0; - for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) { - ((unsigned char *)b->d)[i] = buf[j]; + if (window <= 3) { + for (i = 0; i < top; i++, table += width) { + BN_ULONG acc = 0; + + for (j = 0; j < width; j++) { + acc |= table[j] & + ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); + } + + b->d[i] = acc; + } + } else { + int xstride = 1 << (window - 2); + BN_ULONG y0, y1, y2, y3; + + i = idx >> (window - 2); /* equivalent of idx / xstride */ + idx &= xstride - 1; /* equivalent of idx % xstride */ + + y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1); + y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1); + y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1); + y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1); + + for (i = 0; i < top; i++, table += width) { + BN_ULONG acc = 0; + + for (j = 0; j < xstride; j++) { + acc |= ( (table[j + 0 * xstride] & y0) | + (table[j + 1 * xstride] & y1) | + (table[j + 2 * xstride] & y2) | + (table[j + 3 * xstride] & y3) ) + & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1)); + } + + b->d[i] = acc; + } } b->top = top; @@ -749,8 +788,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (window >= 5) { window = 5; /* ~5% improvement for RSA2048 sign, and even * for RSA4096 */ - if ((top & 7) == 0) - powerbufLen += 2 * top * sizeof(m->d[0]); + /* reserve space for mont->N.d[] copy */ + powerbufLen += top * sizeof(mont->N.d[0]); } #endif (void)0; @@ -971,7 +1010,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BN_ULONG *not_used, const BN_ULONG *np, const BN_ULONG *n0, int num); - BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2; + BN_ULONG *n0 = mont->n0, *np; /* * BN_to_montgomery can contaminate words above .top [in @@ -982,11 +1021,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, for (i = tmp.top; i < top; i++) tmp.d[i] = 0; - if (top & 7) - np2 = np; - else - for 
(np2 = am.d + top, i = 0; i < top; i++) - np2[2 * i] = np[i]; + /* + * copy mont->N.d[] to improve cache locality + */ + for (np = am.d + top, i = 0; i < top; i++) + np[i] = mont->N.d[i]; bn_scatter5(tmp.d, top, powerbuf, 0); bn_scatter5(am.d, am.top, powerbuf, 1); @@ -996,7 +1035,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, # if 0 for (i = 3; i < 32; i++) { /* Calculate a^i = a^(i-1) * a */ - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # else @@ -1007,7 +1046,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } for (i = 3; i < 8; i += 2) { int j; - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); for (j = 2 * i; j < 32; j *= 2) { bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); @@ -1015,13 +1054,13 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } } for (; i < 16; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); bn_scatter5(tmp.d, top, powerbuf, 2 * i); } for (; i < 32; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1); + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); bn_scatter5(tmp.d, top, powerbuf, i); } # endif @@ -1050,11 +1089,11 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, while (bits >= 0) { wvalue = bn_get_bits5(p->d, bits - 4); bits -= 5; - bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue); + bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); } } - ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top); + ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np, n0, top); tmp.top = top; bn_correct_top(&tmp); if (ret) { @@ -1065,9 +1104,9 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, } else #endif { - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window)) goto err; /* @@ -1079,15 +1118,15 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, if (window > 1) { if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF - (&tmp, top, powerbuf, 2, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, + window)) goto err; for (i = 3; i < numPowers; i++) { /* Calculate a^i = a^(i-1) * a */ if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF - (&tmp, top, powerbuf, i, numPowers)) + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, + window)) goto err; } } @@ -1095,8 +1134,8 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, bits--; for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); - if (!MOD_EXP_CTIME_COPY_FROM_PREBUF - (&tmp, top, powerbuf, wvalue, numPowers)) + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue, + window)) goto err; /* @@ -1116,8 +1155,8 @@ int BN_mod_exp_mont_consttime(BIGNUM 
*rr, const BIGNUM *a, const BIGNUM *p, /* * Fetch the appropriate pre-computed value from the pre-buf */ - if (!MOD_EXP_CTIME_COPY_FROM_PREBUF - (&am, top, powerbuf, wvalue, numPowers)) + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, + window)) goto err; /* Multiply the result into the intermediate result */ diff --git a/deps/openssl/openssl/crypto/bn/bn_print.c b/deps/openssl/openssl/crypto/bn/bn_print.c index ab10b957ba27d4..bfa31efc56216b 100644 --- a/deps/openssl/openssl/crypto/bn/bn_print.c +++ b/deps/openssl/openssl/crypto/bn/bn_print.c @@ -58,6 +58,7 @@ #include #include +#include #include "cryptlib.h" #include #include "bn_lcl.h" @@ -189,7 +190,11 @@ int BN_hex2bn(BIGNUM **bn, const char *a) a++; } - for (i = 0; isxdigit((unsigned char)a[i]); i++) ; + for (i = 0; i <= (INT_MAX/4) && isxdigit((unsigned char)a[i]); i++) + continue; + + if (i > INT_MAX/4) + goto err; num = i + neg; if (bn == NULL) @@ -204,7 +209,7 @@ int BN_hex2bn(BIGNUM **bn, const char *a) BN_zero(ret); } - /* i is the number of hex digests; */ + /* i is the number of hex digits */ if (bn_expand(ret, i * 4) == NULL) goto err; @@ -260,7 +265,11 @@ int BN_dec2bn(BIGNUM **bn, const char *a) a++; } - for (i = 0; isdigit((unsigned char)a[i]); i++) ; + for (i = 0; i <= (INT_MAX/4) && isdigit((unsigned char)a[i]); i++) + continue; + + if (i > INT_MAX/4) + goto err; num = i + neg; if (bn == NULL) @@ -278,7 +287,7 @@ int BN_dec2bn(BIGNUM **bn, const char *a) BN_zero(ret); } - /* i is the number of digests, a bit of an over expand; */ + /* i is the number of digits, a bit of an over expand */ if (bn_expand(ret, i * 4) == NULL) goto err; diff --git a/deps/openssl/openssl/crypto/bn/bn_recp.c b/deps/openssl/openssl/crypto/bn/bn_recp.c index 7497ac624d94e5..f047040efe03bd 100644 --- a/deps/openssl/openssl/crypto/bn/bn_recp.c +++ b/deps/openssl/openssl/crypto/bn/bn_recp.c @@ -65,6 +65,7 @@ void BN_RECP_CTX_init(BN_RECP_CTX *recp) BN_init(&(recp->N)); BN_init(&(recp->Nr)); recp->num_bits = 0; + recp->shift = 0; recp->flags = 0; } diff --git a/deps/openssl/openssl/crypto/cmac/cmac.c b/deps/openssl/openssl/crypto/cmac/cmac.c index 774e6dc919050d..2954b6eb7dcfeb 100644 --- a/deps/openssl/openssl/crypto/cmac/cmac.c +++ b/deps/openssl/openssl/crypto/cmac/cmac.c @@ -160,6 +160,14 @@ int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS); return 0; } + + /* Switch to FIPS cipher implementation if possible */ + if (cipher != NULL) { + const EVP_CIPHER *fcipher; + fcipher = FIPS_get_cipherbynid(EVP_CIPHER_nid(cipher)); + if (fcipher != NULL) + cipher = fcipher; + } /* * Other algorithm blocking will be done in FIPS_cmac_init, via * FIPS_cipherinit(). 
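
The bn_exp.c hunks above replace the old byte-interleaved scatter/gather of the modexp power table with word-wide, fully masked reads built on the constant_time_eq_int() helpers, so every table entry is touched on every fetch and the access pattern no longer depends on the secret window value; this is the cache-bank timing ("CacheBleed", CVE-2016-0702) mitigation shipped in 1.0.2g. The C below is a minimal standalone sketch of that masked-selection idea only: ct_eq_word() and ct_gather() are stand-ins written for this note, not OpenSSL functions, and the real MOD_EXP_CTIME_COPY_FROM_PREBUF also has a separate four-stride path for window > 3.

#include <stddef.h>
#include <stdint.h>

/* All-ones if a == b, all-zero otherwise, computed without a branch.
 * Same trick as the constant_time_eq helpers used above: ~x & (x - 1)
 * has its top bit set only when x == 0. */
static uint64_t ct_eq_word(uint64_t a, uint64_t b)
{
    uint64_t x = a ^ b;
    return (uint64_t)0 - ((~x & (x - 1)) >> 63);
}

/* Read entry 'idx' of a table holding 'width' big numbers of 'top'
 * 64-bit words each, laid out word-interleaved as in the patched
 * MOD_EXP_CTIME_COPY_TO_PREBUF (word i of entry j lives at
 * table[i*width + j]).  Every slot is read; the wanted one is kept
 * via the mask, so the memory trace is independent of idx. */
static void ct_gather(uint64_t *out, const uint64_t *table,
                      size_t top, size_t width, size_t idx)
{
    for (size_t i = 0; i < top; i++) {
        uint64_t acc = 0;
        for (size_t j = 0; j < width; j++)
            acc |= table[i * width + j] & ct_eq_word(j, idx);
        out[i] = acc;
    }
}
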
diff --git a/deps/openssl/openssl/crypto/cryptlib.c b/deps/openssl/openssl/crypto/cryptlib.c index c9f674ba8e6249..1925428f5ec532 100644 --- a/deps/openssl/openssl/crypto/cryptlib.c +++ b/deps/openssl/openssl/crypto/cryptlib.c @@ -1016,11 +1016,11 @@ void *OPENSSL_stderr(void) return stderr; } -int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) +int CRYPTO_memcmp(const volatile void *in_a, const volatile void *in_b, size_t len) { size_t i; - const unsigned char *a = in_a; - const unsigned char *b = in_b; + const volatile unsigned char *a = in_a; + const volatile unsigned char *b = in_b; unsigned char x = 0; for (i = 0; i < len; i++) diff --git a/deps/openssl/openssl/crypto/crypto.h b/deps/openssl/openssl/crypto/crypto.h index c450d7a3c374b3..6c644ce12a8250 100644 --- a/deps/openssl/openssl/crypto/crypto.h +++ b/deps/openssl/openssl/crypto/crypto.h @@ -628,7 +628,7 @@ void OPENSSL_init(void); * into a defined order as the return value when a != b is undefined, other * than to be non-zero. */ -int CRYPTO_memcmp(const void *a, const void *b, size_t len); +int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len); /* BEGIN ERROR CODES */ /* diff --git a/deps/openssl/openssl/crypto/dh/dh.h b/deps/openssl/openssl/crypto/dh/dh.h index 5498a9dc106044..a5bd9016aae85a 100644 --- a/deps/openssl/openssl/crypto/dh/dh.h +++ b/deps/openssl/openssl/crypto/dh/dh.h @@ -174,7 +174,7 @@ struct dh_st { /* DH_check_pub_key error codes */ # define DH_CHECK_PUBKEY_TOO_SMALL 0x01 # define DH_CHECK_PUBKEY_TOO_LARGE 0x02 -# define DH_CHECK_PUBKEY_INVALID 0x03 +# define DH_CHECK_PUBKEY_INVALID 0x04 /* * primes p where (p-1)/2 is prime too are called "safe"; we define this for diff --git a/deps/openssl/openssl/crypto/dh/dh_check.c b/deps/openssl/openssl/crypto/dh/dh_check.c index 5adedc0d264e99..027704111432d2 100644 --- a/deps/openssl/openssl/crypto/dh/dh_check.c +++ b/deps/openssl/openssl/crypto/dh/dh_check.c @@ -160,13 +160,12 @@ int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *ret) goto err; BN_CTX_start(ctx); tmp = BN_CTX_get(ctx); - if (tmp == NULL) + if (tmp == NULL || !BN_set_word(tmp, 1)) goto err; - BN_set_word(tmp, 1); if (BN_cmp(pub_key, tmp) <= 0) *ret |= DH_CHECK_PUBKEY_TOO_SMALL; - BN_copy(tmp, dh->p); - BN_sub_word(tmp, 1); + if (BN_copy(tmp, dh->p) == NULL || !BN_sub_word(tmp, 1)) + goto err; if (BN_cmp(pub_key, tmp) >= 0) *ret |= DH_CHECK_PUBKEY_TOO_LARGE; diff --git a/deps/openssl/openssl/crypto/dsa/dsa_ameth.c b/deps/openssl/openssl/crypto/dsa/dsa_ameth.c index c40e1777ade1bf..cc83d6e6ad3b6f 100644 --- a/deps/openssl/openssl/crypto/dsa/dsa_ameth.c +++ b/deps/openssl/openssl/crypto/dsa/dsa_ameth.c @@ -191,6 +191,8 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8) STACK_OF(ASN1_TYPE) *ndsa = NULL; DSA *dsa = NULL; + int ret = 0; + if (!PKCS8_pkey_get0(NULL, &p, &pklen, &palg, p8)) return 0; X509_ALGOR_get0(NULL, &ptype, &pval, palg); @@ -262,23 +264,21 @@ static int dsa_priv_decode(EVP_PKEY *pkey, PKCS8_PRIV_KEY_INFO *p8) } EVP_PKEY_assign_DSA(pkey, dsa); - BN_CTX_free(ctx); - if (ndsa) - sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free); - else - ASN1_STRING_clear_free(privkey); - return 1; + ret = 1; + goto done; decerr: - DSAerr(DSA_F_DSA_PRIV_DECODE, EVP_R_DECODE_ERROR); + DSAerr(DSA_F_DSA_PRIV_DECODE, DSA_R_DECODE_ERROR); dsaerr: + DSA_free(dsa); + done: BN_CTX_free(ctx); - if (privkey) + if (ndsa) + sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free); + else ASN1_STRING_clear_free(privkey); - sk_ASN1_TYPE_pop_free(ndsa, ASN1_TYPE_free); - 
DSA_free(dsa); - return 0; + return ret; } static int dsa_priv_encode(PKCS8_PRIV_KEY_INFO *p8, const EVP_PKEY *pkey) diff --git a/deps/openssl/openssl/crypto/dso/dso_lib.c b/deps/openssl/openssl/crypto/dso/dso_lib.c index 3312450eae67d8..2beb7c1ba54246 100644 --- a/deps/openssl/openssl/crypto/dso/dso_lib.c +++ b/deps/openssl/openssl/crypto/dso/dso_lib.c @@ -122,6 +122,7 @@ DSO *DSO_new_method(DSO_METHOD *meth) ret->meth = meth; ret->references = 1; if ((ret->meth->init != NULL) && !ret->meth->init(ret)) { + sk_void_free(ret->meth_data); OPENSSL_free(ret); ret = NULL; } diff --git a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl index e6acfd59f0d416..7140860e245b45 100755 --- a/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl +++ b/deps/openssl/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl @@ -2001,6 +2001,7 @@ () push %r15 sub \$32*5+8, %rsp +.Lpoint_double_shortcut$x: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x mov $a_ptr, $b_ptr # backup copy movdqu 0x10($a_ptr), %xmm1 @@ -2291,6 +2292,7 @@ () mov 0x40+8*1($b_ptr), $acc6 mov 0x40+8*2($b_ptr), $acc7 mov 0x40+8*3($b_ptr), $acc0 + movq $b_ptr, %xmm1 lea 0x40-$bias($b_ptr), $a_ptr lea $Z1sqr(%rsp), $r_ptr # Z1^2 @@ -2346,7 +2348,7 @@ () test $acc0, $acc0 jnz .Ladd_proceed$x # (in1infty || in2infty)? test $acc1, $acc1 - jz .Ladd_proceed$x # is_equal(S1,S2)? + jz .Ladd_double$x # is_equal(S1,S2)? movq %xmm0, $r_ptr # restore $r_ptr pxor %xmm0, %xmm0 @@ -2358,6 +2360,13 @@ () movdqu %xmm0, 0x50($r_ptr) jmp .Ladd_done$x +.align 32 +.Ladd_double$x: + movq %xmm1, $a_ptr # restore $a_ptr + movq %xmm0, $r_ptr # restore $r_ptr + add \$`32*(18-5)`, %rsp # difference in frame sizes + jmp .Lpoint_double_shortcut$x + .align 32 .Ladd_proceed$x: `&load_for_sqr("$R(%rsp)", "$src0")` diff --git a/deps/openssl/openssl/crypto/ec/ecp_nistp224.c b/deps/openssl/openssl/crypto/ec/ecp_nistp224.c index ed09f97ade6837..d81cc9ce6b1a0d 100644 --- a/deps/openssl/openssl/crypto/ec/ecp_nistp224.c +++ b/deps/openssl/openssl/crypto/ec/ecp_nistp224.c @@ -1657,8 +1657,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) */ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); - ret = 1; - goto err; + goto done; } if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) || @@ -1736,6 +1735,7 @@ int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) } make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems); + done: if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup, nistp224_pre_comp_free, nistp224_pre_comp_clear_free)) diff --git a/deps/openssl/openssl/crypto/ec/ecp_nistp256.c b/deps/openssl/openssl/crypto/ec/ecp_nistp256.c index a5887086c63856..78d191aac7afbb 100644 --- a/deps/openssl/openssl/crypto/ec/ecp_nistp256.c +++ b/deps/openssl/openssl/crypto/ec/ecp_nistp256.c @@ -2249,8 +2249,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx) */ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); - ret = 1; - goto err; + goto done; } if ((!BN_to_felem(x_tmp, &group->generator->X)) || (!BN_to_felem(y_tmp, &group->generator->Y)) || @@ -2337,6 +2336,7 @@ int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx) } make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems); + done: if 
(!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup, nistp256_pre_comp_free, nistp256_pre_comp_clear_free)) diff --git a/deps/openssl/openssl/crypto/ec/ecp_nistp521.c b/deps/openssl/openssl/crypto/ec/ecp_nistp521.c index 360b9a3516f611..c53a61bbfb697d 100644 --- a/deps/openssl/openssl/crypto/ec/ecp_nistp521.c +++ b/deps/openssl/openssl/crypto/ec/ecp_nistp521.c @@ -2056,8 +2056,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx) */ if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); - ret = 1; - goto err; + goto done; } if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) || @@ -2115,6 +2114,7 @@ int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx) } make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems); + done: if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup, nistp521_pre_comp_free, nistp521_pre_comp_clear_free)) diff --git a/deps/openssl/openssl/crypto/ec/ectest.c b/deps/openssl/openssl/crypto/ec/ectest.c index efab0b07b1d2ac..40a1f003259faf 100644 --- a/deps/openssl/openssl/crypto/ec/ectest.c +++ b/deps/openssl/openssl/crypto/ec/ectest.c @@ -1758,9 +1758,18 @@ static void nistp_single_test(const struct nistp_test_params *test) if (0 != EC_POINT_cmp(NISTP, Q, Q_CHECK, ctx)) ABORT; + /* + * We have not performed precomputation so have_precompute mult should be + * false + */ + if (EC_GROUP_have_precompute_mult(NISTP)) + ABORT; + /* now repeat all tests with precomputation */ if (!EC_GROUP_precompute_mult(NISTP, ctx)) ABORT; + if (!EC_GROUP_have_precompute_mult(NISTP)) + ABORT; /* fixed point multiplication */ EC_POINT_mul(NISTP, Q, m, NULL, NULL, ctx); diff --git a/deps/openssl/openssl/crypto/engine/eng_dyn.c b/deps/openssl/openssl/crypto/engine/eng_dyn.c index 3169b09ad86511..40f30e9d585e21 100644 --- a/deps/openssl/openssl/crypto/engine/eng_dyn.c +++ b/deps/openssl/openssl/crypto/engine/eng_dyn.c @@ -243,8 +243,10 @@ static int dynamic_set_data_ctx(ENGINE *e, dynamic_data_ctx **ctx) * If we lost the race to set the context, c is non-NULL and *ctx is the * context of the thread that won. */ - if (c) + if (c) { + sk_OPENSSL_STRING_free(c->dirs); OPENSSL_free(c); + } return 1; } diff --git a/deps/openssl/openssl/crypto/evp/e_des.c b/deps/openssl/openssl/crypto/evp/e_des.c index aae13a675694b4..8ca65cd03ae1f1 100644 --- a/deps/openssl/openssl/crypto/evp/e_des.c +++ b/deps/openssl/openssl/crypto/evp/e_des.c @@ -71,12 +71,13 @@ typedef struct { DES_key_schedule ks; } ks; union { - void (*cbc) (const void *, void *, size_t, const void *, void *); + void (*cbc) (const void *, void *, size_t, + const DES_key_schedule *, unsigned char *); } stream; } EVP_DES_KEY; # if defined(AES_ASM) && (defined(__sparc) || defined(__sparc__)) -/* ---------^^^ this is not a typo, just a way to detect that +/* ----------^^^ this is not a typo, just a way to detect that * assembler support was in general requested... 
*/ # include "sparc_arch.h" @@ -86,9 +87,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[]; void des_t4_key_expand(const void *key, DES_key_schedule *ks); void des_t4_cbc_encrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule *ks, unsigned char iv[8]); void des_t4_cbc_decrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule *ks, unsigned char iv[8]); # endif static int des_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, @@ -130,7 +131,7 @@ static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, { EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data; - if (dat->stream.cbc) { + if (dat->stream.cbc != NULL) { (*dat->stream.cbc) (in, out, inl, &dat->ks.ks, ctx->iv); return 1; } diff --git a/deps/openssl/openssl/crypto/evp/e_des3.c b/deps/openssl/openssl/crypto/evp/e_des3.c index bf6c1d2d3d39a4..0e910d6d8085b9 100644 --- a/deps/openssl/openssl/crypto/evp/e_des3.c +++ b/deps/openssl/openssl/crypto/evp/e_des3.c @@ -75,7 +75,8 @@ typedef struct { DES_key_schedule ks[3]; } ks; union { - void (*cbc) (const void *, void *, size_t, const void *, void *); + void (*cbc) (const void *, void *, size_t, + const DES_key_schedule *, unsigned char *); } stream; } DES_EDE_KEY; # define ks1 ks.ks[0] @@ -93,9 +94,9 @@ extern unsigned int OPENSSL_sparcv9cap_P[]; void des_t4_key_expand(const void *key, DES_key_schedule *ks); void des_t4_ede3_cbc_encrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule ks[3], unsigned char iv[8]); void des_t4_ede3_cbc_decrypt(const void *inp, void *out, size_t len, - DES_key_schedule *ks, unsigned char iv[8]); + const DES_key_schedule ks[3], unsigned char iv[8]); # endif static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, @@ -162,7 +163,7 @@ static int des_ede_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, } # endif /* KSSL_DEBUG */ if (dat->stream.cbc) { - (*dat->stream.cbc) (in, out, inl, &dat->ks, ctx->iv); + (*dat->stream.cbc) (in, out, inl, dat->ks.ks, ctx->iv); return 1; } @@ -395,7 +396,7 @@ static int des_ede3_unwrap(EVP_CIPHER_CTX *ctx, unsigned char *out, int rv = -1; if (inl < 24) return -1; - if (!out) + if (out == NULL) return inl - 16; memcpy(ctx->iv, wrap_iv, 8); /* Decrypt first block which will end up as icv */ @@ -438,7 +439,7 @@ static int des_ede3_wrap(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) { unsigned char sha1tmp[SHA_DIGEST_LENGTH]; - if (!out) + if (out == NULL) return inl + 16; /* Copy input to output buffer + 8 so we have space for IV */ memmove(out + 8, in, inl); diff --git a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl index bd6bf72fe48767..980cfd23efe34e 100644 --- a/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -43,7 +43,7 @@ if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); + $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && @@ -489,7 +489,7 @@ ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 - movaps -0xd8(%rax),%xmm7 + movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 diff --git a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl 
b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl index 4ff2d39aa7b281..f889f2018789fb 100644 --- a/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl +++ b/deps/openssl/openssl/crypto/modes/asm/ghash-x86_64.pl @@ -92,7 +92,7 @@ if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22); + $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && diff --git a/deps/openssl/openssl/crypto/modes/ctr128.c b/deps/openssl/openssl/crypto/modes/ctr128.c index f3bbcbf7237609..bcafd6b6bfb169 100644 --- a/deps/openssl/openssl/crypto/modes/ctr128.c +++ b/deps/openssl/openssl/crypto/modes/ctr128.c @@ -67,23 +67,20 @@ /* increment counter (128-bit int) by 1 */ static void ctr128_inc(unsigned char *counter) { - u32 n = 16; - u8 c; + u32 n = 16, c = 1; do { --n; - c = counter[n]; - ++c; - counter[n] = c; - if (c) - return; + c += counter[n]; + counter[n] = (u8)c; + c >>= 8; } while (n); } #if !defined(OPENSSL_SMALL_FOOTPRINT) static void ctr128_inc_aligned(unsigned char *counter) { - size_t *data, c, n; + size_t *data, c, d, n; const union { long one; char little; @@ -91,20 +88,19 @@ static void ctr128_inc_aligned(unsigned char *counter) 1 }; - if (is_endian.little) { + if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) { ctr128_inc(counter); return; } data = (size_t *)counter; + c = 1; n = 16 / sizeof(size_t); do { --n; - c = data[n]; - ++c; - data[n] = c; - if (c) - return; + d = data[n] += c; + /* did addition carry? */ + c = ((d - c) ^ d) >> (sizeof(size_t) * 8 - 1); } while (n); } #endif @@ -144,14 +140,14 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, } # if defined(STRICT_ALIGNMENT) - if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != - 0) + if (((size_t)in | (size_t)out | (size_t)ecount_buf) + % sizeof(size_t) != 0) break; # endif while (len >= 16) { (*block) (ivec, ecount_buf, key); ctr128_inc_aligned(ivec); - for (; n < 16; n += sizeof(size_t)) + for (n = 0; n < 16; n += sizeof(size_t)) *(size_t *)(out + n) = *(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n); len -= 16; @@ -189,16 +185,13 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, /* increment upper 96 bits of 128-bit counter by 1 */ static void ctr96_inc(unsigned char *counter) { - u32 n = 12; - u8 c; + u32 n = 12, c = 1; do { --n; - c = counter[n]; - ++c; - counter[n] = c; - if (c) - return; + c += counter[n]; + counter[n] = (u8)c; + c >>= 8; } while (n); } diff --git a/deps/openssl/openssl/crypto/opensslv.h b/deps/openssl/openssl/crypto/opensslv.h index 03b8c4843784a7..4334fd15cd8739 100644 --- a/deps/openssl/openssl/crypto/opensslv.h +++ b/deps/openssl/openssl/crypto/opensslv.h @@ -30,11 +30,11 @@ extern "C" { * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for * major minor fix final patch/beta) */ -# define OPENSSL_VERSION_NUMBER 0x1000206fL +# define OPENSSL_VERSION_NUMBER 0x1000207fL # ifdef OPENSSL_FIPS -# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f-fips 28 Jan 2016" +# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g-fips 1 Mar 2016" # else -# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f 28 Jan 2016" +# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g 1 Mar 2016" # endif # define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT diff --git a/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl b/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl index 9c70b8c2c6e9d7..ee04221c7e1da7 100755 --- 
a/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl +++ b/deps/openssl/openssl/crypto/perlasm/x86_64-xlate.pl @@ -198,8 +198,11 @@ if ($gas) { # Solaris /usr/ccs/bin/as can't handle multiplications # in $self->{value} - $self->{value} =~ s/(?{value} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg; + my $value = $self->{value}; + $value =~ s/(?{value} = $value; + } sprintf "\$%s",$self->{value}; } else { $self->{value} =~ s/(0b[0-1]+)/oct($1)/eig; diff --git a/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c b/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c index c4d3724d2a48b0..dc9b484078afc9 100644 --- a/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c +++ b/deps/openssl/openssl/crypto/pkcs7/pk7_smime.c @@ -274,12 +274,29 @@ int PKCS7_verify(PKCS7 *p7, STACK_OF(X509) *certs, X509_STORE *store, PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_NO_CONTENT); return 0; } +#if 0 + /* + * NB: this test commented out because some versions of Netscape + * illegally include zero length content when signing data. Also + * Microsoft Authenticode includes a SpcIndirectDataContent data + * structure which describes the content to be protected by the + * signature, rather than directly embedding that content. So + * Authenticode implementations are also expected to use + * PKCS7_verify() with explicit external data, on non-detached + * PKCS#7 signatures. + * + * In OpenSSL 1.1 a new flag PKCS7_NO_DUAL_CONTENT has been + * introduced to disable this sanity check. For the 1.0.2 branch + * this change is not acceptable, so the check remains completely + * commented out (as it has been for a long time). + */ /* Check for data and content: two sets of data */ if (!PKCS7_get_detached(p7) && indata) { PKCS7err(PKCS7_F_PKCS7_VERIFY, PKCS7_R_CONTENT_AND_DATA_PRESENT); return 0; } +#endif sinfos = PKCS7_get_signer_info(p7); diff --git a/deps/openssl/openssl/crypto/rsa/rsa_sign.c b/deps/openssl/openssl/crypto/rsa/rsa_sign.c index ed63a1d8b0e3fe..82ca8324dfbc11 100644 --- a/deps/openssl/openssl/crypto/rsa/rsa_sign.c +++ b/deps/openssl/openssl/crypto/rsa/rsa_sign.c @@ -84,7 +84,7 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len, return 0; } #endif - if (rsa->meth->rsa_sign) { + if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign) { return rsa->meth->rsa_sign(type, m, m_len, sigret, siglen, rsa); } /* Special case: SSL signature, just check the length */ @@ -293,7 +293,7 @@ int RSA_verify(int dtype, const unsigned char *m, unsigned int m_len, const unsigned char *sigbuf, unsigned int siglen, RSA *rsa) { - if (rsa->meth->rsa_verify) { + if ((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_verify) { return rsa->meth->rsa_verify(dtype, m, m_len, sigbuf, siglen, rsa); } diff --git a/deps/openssl/openssl/crypto/srp/srp.h b/deps/openssl/openssl/crypto/srp/srp.h index d072536fec9bb3..028892a1ff5e04 100644 --- a/deps/openssl/openssl/crypto/srp/srp.h +++ b/deps/openssl/openssl/crypto/srp/srp.h @@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st { DECLARE_STACK_OF(SRP_gN_cache) typedef struct SRP_user_pwd_st { + /* Owned by us. */ char *id; BIGNUM *s; BIGNUM *v; + /* Not owned by us. */ const BIGNUM *g; const BIGNUM *N; + /* Owned by us. 
*/ char *info; } SRP_user_pwd; DECLARE_STACK_OF(SRP_user_pwd) +void SRP_user_pwd_free(SRP_user_pwd *user_pwd); + typedef struct SRP_VBASE_st { STACK_OF(SRP_user_pwd) *users_pwd; STACK_OF(SRP_gN_cache) *gN_cache; @@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN) SRP_VBASE *SRP_VBASE_new(char *seed_key); int SRP_VBASE_free(SRP_VBASE *vb); int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file); + +/* This method ignores the configured seed and fails for an unknown user. */ SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username); +/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/ +SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username); + char *SRP_create_verifier(const char *user, const char *pass, char **salt, char **verifier, const char *N, const char *g); int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, diff --git a/deps/openssl/openssl/crypto/srp/srp_vfy.c b/deps/openssl/openssl/crypto/srp/srp_vfy.c index a3f1a8a0a4d5bd..26ad3e07b4bb3f 100644 --- a/deps/openssl/openssl/crypto/srp/srp_vfy.c +++ b/deps/openssl/openssl/crypto/srp/srp_vfy.c @@ -185,7 +185,7 @@ static char *t_tob64(char *dst, const unsigned char *src, int size) return olddst; } -static void SRP_user_pwd_free(SRP_user_pwd *user_pwd) +void SRP_user_pwd_free(SRP_user_pwd *user_pwd) { if (user_pwd == NULL) return; @@ -247,6 +247,24 @@ static int SRP_user_pwd_set_sv_BN(SRP_user_pwd *vinfo, BIGNUM *s, BIGNUM *v) return (vinfo->s != NULL && vinfo->v != NULL); } +static SRP_user_pwd *srp_user_pwd_dup(SRP_user_pwd *src) +{ + SRP_user_pwd *ret; + + if (src == NULL) + return NULL; + if ((ret = SRP_user_pwd_new()) == NULL) + return NULL; + + SRP_user_pwd_set_gN(ret, src->g, src->N); + if (!SRP_user_pwd_set_ids(ret, src->id, src->info) + || !SRP_user_pwd_set_sv_BN(ret, BN_dup(src->s), BN_dup(src->v))) { + SRP_user_pwd_free(ret); + return NULL; + } + return ret; +} + SRP_VBASE *SRP_VBASE_new(char *seed_key) { SRP_VBASE *vb = (SRP_VBASE *)OPENSSL_malloc(sizeof(SRP_VBASE)); @@ -468,21 +486,50 @@ int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file) } -SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username) +static SRP_user_pwd *find_user(SRP_VBASE *vb, char *username) { int i; SRP_user_pwd *user; - unsigned char digv[SHA_DIGEST_LENGTH]; - unsigned char digs[SHA_DIGEST_LENGTH]; - EVP_MD_CTX ctxt; if (vb == NULL) return NULL; + for (i = 0; i < sk_SRP_user_pwd_num(vb->users_pwd); i++) { user = sk_SRP_user_pwd_value(vb->users_pwd, i); if (strcmp(user->id, username) == 0) return user; } + + return NULL; +} + +/* + * This method ignores the configured seed and fails for an unknown user. + * Ownership of the returned pointer is not released to the caller. + * In other words, caller must not free the result. + */ +SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username) +{ + return find_user(vb, username); +} + +/* + * Ownership of the returned pointer is released to the caller. + * In other words, caller must free the result once done. 
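Illustrative sketch (not from the upstream change): with the ownership rule just described, a caller of the new SRP_VBASE_get1_by_user() must release the returned record with SRP_user_pwd_free(), whereas the result of the older SRP_VBASE_get_by_user() must not be freed. The verifier file and user name are supplied by the caller and are hypothetical here.

    /* Sketch only, assuming OpenSSL 1.0.2g headers; error handling abbreviated. */
    #include <openssl/srp.h>

    int lookup_and_release(char *verifier_file, char *username)
    {
        SRP_VBASE *vb = SRP_VBASE_new(NULL);      /* no seed key configured */
        SRP_user_pwd *up;
        int found = 0;

        if (vb == NULL)
            return -1;
        if (SRP_VBASE_init(vb, verifier_file) != SRP_NO_ERROR) {
            SRP_VBASE_free(vb);
            return -1;
        }
        up = SRP_VBASE_get1_by_user(vb, username);
        if (up != NULL) {
            found = 1;
            SRP_user_pwd_free(up);                /* caller owns the copy */
        }
        /* SRP_VBASE_get_by_user() results are owned by vb: never free them. */
        SRP_VBASE_free(vb);
        return found;
    }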
+ */ +SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username) +{ + SRP_user_pwd *user; + unsigned char digv[SHA_DIGEST_LENGTH]; + unsigned char digs[SHA_DIGEST_LENGTH]; + EVP_MD_CTX ctxt; + + if (vb == NULL) + return NULL; + + if ((user = find_user(vb, username)) != NULL) + return srp_user_pwd_dup(user); + if ((vb->seed_key == NULL) || (vb->default_g == NULL) || (vb->default_N == NULL)) return NULL; diff --git a/deps/openssl/openssl/crypto/stack/stack.c b/deps/openssl/openssl/crypto/stack/stack.c index de437acf6a5cb6..fa50083e22b3b9 100644 --- a/deps/openssl/openssl/crypto/stack/stack.c +++ b/deps/openssl/openssl/crypto/stack/stack.c @@ -360,7 +360,7 @@ void *sk_set(_STACK *st, int i, void *value) void sk_sort(_STACK *st) { - if (st && !st->sorted) { + if (st && !st->sorted && st->comp != NULL) { int (*comp_func) (const void *, const void *); /* diff --git a/deps/openssl/openssl/crypto/x509/x509_vfy.c b/deps/openssl/openssl/crypto/x509/x509_vfy.c index 0429767032fd9c..4d34dbac93149e 100644 --- a/deps/openssl/openssl/crypto/x509/x509_vfy.c +++ b/deps/openssl/openssl/crypto/x509/x509_vfy.c @@ -194,6 +194,9 @@ int X509_verify_cert(X509_STORE_CTX *ctx) int num, j, retry; int (*cb) (int xok, X509_STORE_CTX *xctx); STACK_OF(X509) *sktmp = NULL; + int trust = X509_TRUST_UNTRUSTED; + int err; + if (ctx->cert == NULL) { X509err(X509_F_X509_VERIFY_CERT, X509_R_NO_CERT_SET_FOR_US_TO_VERIFY); return -1; @@ -216,7 +219,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (((ctx->chain = sk_X509_new_null()) == NULL) || (!sk_X509_push(ctx->chain, ctx->cert))) { X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - goto end; + ok = -1; + goto err; } CRYPTO_add(&ctx->cert->references, 1, CRYPTO_LOCK_X509); ctx->last_untrusted = 1; @@ -225,7 +229,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (ctx->untrusted != NULL && (sktmp = sk_X509_dup(ctx->untrusted)) == NULL) { X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - goto end; + ok = -1; + goto err; } num = sk_X509_num(ctx->chain); @@ -249,7 +254,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) { ok = ctx->get_issuer(&xtmp, ctx, x); if (ok < 0) - goto end; + goto err; /* * If successful for now free up cert so it will be picked up * again later. @@ -266,7 +271,8 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (xtmp != NULL) { if (!sk_X509_push(ctx->chain, xtmp)) { X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - goto end; + ok = -1; + goto err; } CRYPTO_add(&xtmp->references, 1, CRYPTO_LOCK_X509); (void)sk_X509_delete_ptr(sktmp, xtmp); @@ -314,7 +320,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx) bad_chain = 1; ok = cb(0, ctx); if (!ok) - goto end; + goto err; } else { /* * We have a match: replace certificate with store @@ -347,25 +353,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx) ok = ctx->get_issuer(&xtmp, ctx, x); if (ok < 0) - goto end; + goto err; if (ok == 0) break; x = xtmp; if (!sk_X509_push(ctx->chain, x)) { X509_free(xtmp); X509err(X509_F_X509_VERIFY_CERT, ERR_R_MALLOC_FAILURE); - ok = 0; - goto end; + ok = -1; + goto err; } num++; } /* we now have our chain, lets check it... */ - i = check_trust(ctx); + if ((trust = check_trust(ctx)) == X509_TRUST_REJECTED) { + /* Callback already issued */ + ok = 0; + goto err; + } - /* If explicitly rejected error */ - if (i == X509_TRUST_REJECTED) - goto end; /* * If it's not explicitly trusted then check if there is an alternative * chain that could be used. 
We only do this if we haven't already @@ -373,14 +380,14 @@ int X509_verify_cert(X509_STORE_CTX *ctx) * chain checking */ retry = 0; - if (i != X509_TRUST_TRUSTED + if (trust != X509_TRUST_TRUSTED && !(ctx->param->flags & X509_V_FLAG_TRUSTED_FIRST) && !(ctx->param->flags & X509_V_FLAG_NO_ALT_CHAINS)) { while (j-- > 1) { xtmp2 = sk_X509_value(ctx->chain, j - 1); ok = ctx->get_issuer(&xtmp, ctx, xtmp2); if (ok < 0) - goto end; + goto err; /* Check if we found an alternate chain */ if (ok > 0) { /* @@ -410,7 +417,7 @@ int X509_verify_cert(X509_STORE_CTX *ctx) * self signed certificate in which case we've indicated an error already * and set bad_chain == 1 */ - if (i != X509_TRUST_TRUSTED && !bad_chain) { + if (trust != X509_TRUST_TRUSTED && !bad_chain) { if ((chain_ss == NULL) || !ctx->check_issued(ctx, x, chain_ss)) { if (ctx->last_untrusted >= num) ctx->error = X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY; @@ -431,26 +438,26 @@ int X509_verify_cert(X509_STORE_CTX *ctx) bad_chain = 1; ok = cb(0, ctx); if (!ok) - goto end; + goto err; } /* We have the chain complete: now we need to check its purpose */ ok = check_chain_extensions(ctx); if (!ok) - goto end; + goto err; /* Check name constraints */ ok = check_name_constraints(ctx); if (!ok) - goto end; + goto err; ok = check_id(ctx); if (!ok) - goto end; + goto err; /* We may as well copy down any DSA parameters that are required */ X509_get_pubkey_parameters(NULL, ctx->chain); @@ -462,16 +469,16 @@ int X509_verify_cert(X509_STORE_CTX *ctx) ok = ctx->check_revocation(ctx); if (!ok) - goto end; + goto err; - i = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain, - ctx->param->flags); - if (i != X509_V_OK) { - ctx->error = i; + err = X509_chain_check_suiteb(&ctx->error_depth, NULL, ctx->chain, + ctx->param->flags); + if (err != X509_V_OK) { + ctx->error = err; ctx->current_cert = sk_X509_value(ctx->chain, ctx->error_depth); ok = cb(0, ctx); if (!ok) - goto end; + goto err; } /* At this point, we have a chain and need to verify it */ @@ -480,25 +487,28 @@ int X509_verify_cert(X509_STORE_CTX *ctx) else ok = internal_verify(ctx); if (!ok) - goto end; + goto err; #ifndef OPENSSL_NO_RFC3779 /* RFC 3779 path validation, now that CRL check has been done */ ok = v3_asid_validate_path(ctx); if (!ok) - goto end; + goto err; ok = v3_addr_validate_path(ctx); if (!ok) - goto end; + goto err; #endif /* If we get this far evaluate policies */ if (!bad_chain && (ctx->param->flags & X509_V_FLAG_POLICY_CHECK)) ok = ctx->check_policy(ctx); if (!ok) - goto end; + goto err; if (0) { - end: + err: + /* Ensure we return an error */ + if (ok > 0) + ok = 0; X509_get_pubkey_parameters(NULL, ctx->chain); } if (sktmp != NULL) diff --git a/deps/openssl/openssl/doc/apps/ciphers.pod b/deps/openssl/openssl/doc/apps/ciphers.pod index 1c26e3b3da36ab..9643b4d48ca8c1 100644 --- a/deps/openssl/openssl/doc/apps/ciphers.pod +++ b/deps/openssl/openssl/doc/apps/ciphers.pod @@ -38,25 +38,21 @@ SSL v2 and for SSL v3/TLS v1. Like B<-v>, but include cipher suite codes in output (hex format). -=item B<-ssl3> +=item B<-ssl3>, B<-tls1> -only include SSL v3 ciphers. +This lists ciphers compatible with any of SSLv3, TLSv1, TLSv1.1 or TLSv1.2. =item B<-ssl2> -only include SSL v2 ciphers. - -=item B<-tls1> - -only include TLS v1 ciphers. +Only include SSLv2 ciphers. =item B<-h>, B<-?> -print a brief usage message. +Print a brief usage message. =item B -a cipher list to convert to a cipher preference list. If it is not included +A cipher list to convert to a cipher preference list. 
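For illustration only (not from the upstream change): the X509_verify_cert() rework above is about reporting failure consistently through both the return value and the stored error code, so a caller is expected to check both, roughly as in this sketch. The store and certificate are assumed to have been set up elsewhere.

    /* Sketch only: treat any non-positive return as failure and consult the
     * stored error code. "store" and "cert" come from elsewhere. */
    #include <stdio.h>
    #include <openssl/x509.h>
    #include <openssl/x509_vfy.h>

    int verify_one(X509_STORE *store, X509 *cert)
    {
        X509_STORE_CTX *csc = X509_STORE_CTX_new();
        int ok = -1;

        if (csc == NULL)
            return -1;
        if (X509_STORE_CTX_init(csc, store, cert, NULL)) {
            ok = X509_verify_cert(csc);
            if (ok <= 0)
                fprintf(stderr, "verify failed: %s\n",
                        X509_verify_cert_error_string(X509_STORE_CTX_get_error(csc)));
        }
        X509_STORE_CTX_free(csc);
        return ok > 0;
    }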
If it is not included then the default cipher list will be used. The format is described below. =back @@ -109,9 +105,10 @@ The following is a list of all permitted cipher strings and their meanings. =item B -the default cipher list. This is determined at compile time and -is normally B. This must be the firstcipher string -specified. +The default cipher list. +This is determined at compile time and is normally +B. +When used, this must be the first cipherstring specified. =item B @@ -139,34 +136,46 @@ than 128 bits, and some cipher suites with 128-bit keys. =item B -"low" encryption cipher suites, currently those using 64 or 56 bit encryption algorithms -but excluding export cipher suites. +Low strength encryption cipher suites, currently those using 64 or 56 bit +encryption algorithms but excluding export cipher suites. +As of OpenSSL 1.0.2g, these are disabled in default builds. =item B, B -export encryption algorithms. Including 40 and 56 bits algorithms. +Export strength encryption algorithms. Including 40 and 56 bits algorithms. +As of OpenSSL 1.0.2g, these are disabled in default builds. =item B -40 bit export encryption algorithms +40-bit export encryption algorithms +As of OpenSSL 1.0.2g, these are disabled in default builds. =item B -56 bit export encryption algorithms. In OpenSSL 0.9.8c and later the set of +56-bit export encryption algorithms. In OpenSSL 0.9.8c and later the set of 56 bit export ciphers is empty unless OpenSSL has been explicitly configured with support for experimental ciphers. +As of OpenSSL 1.0.2g, these are disabled in default builds. =item B, B -the "NULL" ciphers that is those offering no encryption. Because these offer no -encryption at all and are a security risk they are disabled unless explicitly -included. +The "NULL" ciphers that is those offering no encryption. Because these offer no +encryption at all and are a security risk they are not enabled via either the +B or B cipher strings. +Be careful when building cipherlists out of lower-level primitives such as +B or B as these do overlap with the B ciphers. +When in doubt, include B in your cipherlist. =item B -the cipher suites offering no authentication. This is currently the anonymous +The cipher suites offering no authentication. This is currently the anonymous DH algorithms and anonymous ECDH algorithms. These cipher suites are vulnerable to a "man in the middle" attack and so their use is normally discouraged. +These are excluded from the B ciphers, but included in the B +ciphers. +Be careful when building cipherlists out of lower-level primitives such as +B or B as these do overlap with the B ciphers. +When in doubt, include B in your cipherlist. =item B, B @@ -582,11 +591,11 @@ Note: these ciphers can also be used in SSL v3. =head2 Deprecated SSL v2.0 cipher suites. SSL_CK_RC4_128_WITH_MD5 RC4-MD5 - SSL_CK_RC4_128_EXPORT40_WITH_MD5 EXP-RC4-MD5 - SSL_CK_RC2_128_CBC_WITH_MD5 RC2-MD5 - SSL_CK_RC2_128_CBC_EXPORT40_WITH_MD5 EXP-RC2-MD5 + SSL_CK_RC4_128_EXPORT40_WITH_MD5 Not implemented. + SSL_CK_RC2_128_CBC_WITH_MD5 RC2-CBC-MD5 + SSL_CK_RC2_128_CBC_EXPORT40_WITH_MD5 Not implemented. SSL_CK_IDEA_128_CBC_WITH_MD5 IDEA-CBC-MD5 - SSL_CK_DES_64_CBC_WITH_MD5 DES-CBC-MD5 + SSL_CK_DES_64_CBC_WITH_MD5 Not implemented. 
SSL_CK_DES_192_EDE3_CBC_WITH_MD5 DES-CBC3-MD5 =head1 NOTES diff --git a/deps/openssl/openssl/doc/apps/pkeyutl.pod b/deps/openssl/openssl/doc/apps/pkeyutl.pod index 27be9a90079f6a..5da347c97d327c 100644 --- a/deps/openssl/openssl/doc/apps/pkeyutl.pod +++ b/deps/openssl/openssl/doc/apps/pkeyutl.pod @@ -137,6 +137,19 @@ Unless otherwise mentioned all algorithms support the B option which specifies the digest in use for sign, verify and verifyrecover operations. The value B should represent a digest name as used in the EVP_get_digestbyname() function for example B. +This value is used only for sanity-checking the lengths of data passed in to +the B and for creating the structures that make up the signature +(e.g. B in RSASSA PKCS#1 v1.5 signatures). +In case of RSA, ECDSA and DSA signatures, this utility +will not perform hashing on input data but rather use the data directly as +input of signature algorithm. Depending on key type, signature type and mode +of padding, the maximum acceptable lengths of input data differ. In general, +with RSA the signed data can't be longer than the key modulus, in case of ECDSA +and DSA the data shouldn't be longer than field size, otherwise it will be +silently truncated to field size. + +In other words, if the value of digest is B the input should be 20 bytes +long binary encoding of SHA-1 hash function output. =head1 RSA ALGORITHM diff --git a/deps/openssl/openssl/doc/apps/req.pod b/deps/openssl/openssl/doc/apps/req.pod index 54a4d394d282f0..30653e50935777 100644 --- a/deps/openssl/openssl/doc/apps/req.pod +++ b/deps/openssl/openssl/doc/apps/req.pod @@ -347,9 +347,12 @@ configuration file values. =item B -This specifies the default key size in bits. If not specified then -512 is used. It is used if the B<-new> option is used. It can be -overridden by using the B<-newkey> option. +Specifies the default key size in bits. + +This option is used in conjunction with the B<-new> option to generate +a new key. It can be overridden by specifying an explicit key size in +the B<-newkey> option. The smallest accepted key size is 512 bits. If +no key size is specified then 2048 bits is used. =item B diff --git a/deps/openssl/openssl/doc/apps/s_client.pod b/deps/openssl/openssl/doc/apps/s_client.pod index 84d0527069418d..618df9659d3bc6 100644 --- a/deps/openssl/openssl/doc/apps/s_client.pod +++ b/deps/openssl/openssl/doc/apps/s_client.pod @@ -201,15 +201,11 @@ Use the PSK key B when using a PSK cipher suite. The key is given as a hexadecimal number without leading 0x, for example -psk 1a2b3c4d. -=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2> +=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-tls1_1>, B<-tls1_2>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2> -these options disable the use of certain SSL or TLS protocols. By default -the initial handshake uses a method which should be compatible with all -servers and permit them to use SSL v3, SSL v2 or TLS as appropriate. - -Unfortunately there are still ancient and broken servers in use which -cannot handle this technique and will fail to connect. Some servers only -work if TLS is turned off. +These options require or disable the use of the specified SSL or TLS protocols. +By default the initial handshake uses a I method which will +negotiate the highest mutually supported protocol version. 
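Illustrative sketch (not from the upstream change) of the pkeyutl behaviour documented above: the input is an already-computed digest, which at the library level corresponds roughly to EVP_PKEY_sign() over the raw hash. The key is assumed to be an RSA key loaded elsewhere; error handling is abbreviated.

    /* Sketch only: sign a precomputed SHA-256 digest, as pkeyutl does with
     * -pkeyopt digest:sha256. On entry *siglen holds the size of sig. */
    #include <openssl/evp.h>
    #include <openssl/sha.h>

    int sign_digest(EVP_PKEY *pkey, const unsigned char *msg, size_t msglen,
                    unsigned char *sig, size_t *siglen)
    {
        unsigned char md[SHA256_DIGEST_LENGTH];
        EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new(pkey, NULL);
        int ret = 0;

        if (ctx == NULL)
            return 0;
        SHA256(msg, msglen, md);            /* hash first, then sign the hash */
        if (EVP_PKEY_sign_init(ctx) > 0
            && EVP_PKEY_CTX_set_signature_md(ctx, EVP_sha256()) > 0
            && EVP_PKEY_sign(ctx, sig, siglen, md, sizeof(md)) > 0)
            ret = 1;
        EVP_PKEY_CTX_free(ctx);
        return ret;
    }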
=item B<-fallback_scsv> diff --git a/deps/openssl/openssl/doc/apps/s_server.pod b/deps/openssl/openssl/doc/apps/s_server.pod index baca7792446f2e..6f4acb7006ffdc 100644 --- a/deps/openssl/openssl/doc/apps/s_server.pod +++ b/deps/openssl/openssl/doc/apps/s_server.pod @@ -217,11 +217,11 @@ Use the PSK key B when using a PSK cipher suite. The key is given as a hexadecimal number without leading 0x, for example -psk 1a2b3c4d. -=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1> +=item B<-ssl2>, B<-ssl3>, B<-tls1>, B<-tls1_1>, B<-tls1_2>, B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2> -these options disable the use of certain SSL or TLS protocols. By default -the initial handshake uses a method which should be compatible with all -servers and permit them to use SSL v3, SSL v2 or TLS as appropriate. +These options require or disable the use of the specified SSL or TLS protocols. +By default the initial handshake uses a I method which will +negotiate the highest mutually supported protocol version. =item B<-bugs> diff --git a/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod b/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod index 8f85e0dceeb7f2..9f239648d7525b 100644 --- a/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod +++ b/deps/openssl/openssl/doc/crypto/BIO_s_mem.pod @@ -16,7 +16,7 @@ BIO_get_mem_ptr, BIO_new_mem_buf - memory BIO BIO_set_mem_buf(BIO *b,BUF_MEM *bm,int c) BIO_get_mem_ptr(BIO *b,BUF_MEM **pp) - BIO *BIO_new_mem_buf(void *buf, int len); + BIO *BIO_new_mem_buf(const void *buf, int len); =head1 DESCRIPTION @@ -61,7 +61,7 @@ BIO_get_mem_ptr() places the underlying BUF_MEM structure in B. It is a macro. BIO_new_mem_buf() creates a memory BIO using B bytes of data at B, -if B is -1 then the B is assumed to be null terminated and its +if B is -1 then the B is assumed to be nul terminated and its length is determined by B. The BIO is set to a read only state and as a result cannot be written to. This is useful when some data needs to be made available from a static area of memory in the form of a BIO. The diff --git a/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod b/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod index 2bf1a60e9013fd..e81d76ae779ad4 100644 --- a/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod +++ b/deps/openssl/openssl/doc/ssl/SSL_CONF_cmd.pod @@ -74,7 +74,7 @@ B). Curve names are case sensitive. =item B<-named_curve> -This sets the temporary curve used for ephemeral ECDH modes. Only used by +This sets the temporary curve used for ephemeral ECDH modes. Only used by servers The B argument is a curve name or the special value B which @@ -85,7 +85,7 @@ can be either the B name (e.g. B) or an OpenSSL OID name =item B<-cipher> Sets the cipher suite list to B. Note: syntax checking of B is -currently not performed unless a B or B structure is +currently not performed unless a B or B structure is associated with B. =item B<-cert> @@ -111,9 +111,9 @@ operations are permitted. =item B<-no_ssl2>, B<-no_ssl3>, B<-no_tls1>, B<-no_tls1_1>, B<-no_tls1_2> -Disables protocol support for SSLv2, SSLv3, TLS 1.0, TLS 1.1 or TLS 1.2 -by setting the corresponding options B, B, -B, B and B respectively. +Disables protocol support for SSLv2, SSLv3, TLSv1.0, TLSv1.1 or TLSv1.2 +by setting the corresponding options B, B, +B, B and B respectively. =item B<-bugs> @@ -177,7 +177,7 @@ Note: the command prefix (if set) alters the recognised B values. =item B Sets the cipher suite list to B. 
Note: syntax checking of B is -currently not performed unless an B or B structure is +currently not performed unless an B or B structure is associated with B. =item B @@ -244,7 +244,7 @@ B). Curve names are case sensitive. =item B -This sets the temporary curve used for ephemeral ECDH modes. Only used by +This sets the temporary curve used for ephemeral ECDH modes. Only used by servers The B argument is a curve name or the special value B which @@ -258,10 +258,11 @@ The supported versions of the SSL or TLS protocol. The B argument is a comma separated list of supported protocols to enable or disable. If an protocol is preceded by B<-> that version is disabled. -All versions are enabled by default, though applications may choose to -explicitly disable some. Currently supported protocol values are B, -B, B, B and B. The special value B refers -to all supported versions. +Currently supported protocol values are B, B, B, +B and B. +All protocol versions other than B are enabled by default. +To avoid inadvertent enabling of B, when SSLv2 is disabled, it is not +possible to enable it via the B command. =item B @@ -339,16 +340,16 @@ The value is a directory name. The order of operations is significant. This can be used to set either defaults or values which cannot be overridden. For example if an application calls: - SSL_CONF_cmd(ctx, "Protocol", "-SSLv2"); + SSL_CONF_cmd(ctx, "Protocol", "-SSLv3"); SSL_CONF_cmd(ctx, userparam, uservalue); -it will disable SSLv2 support by default but the user can override it. If +it will disable SSLv3 support by default but the user can override it. If however the call sequence is: SSL_CONF_cmd(ctx, userparam, uservalue); - SSL_CONF_cmd(ctx, "Protocol", "-SSLv2"); + SSL_CONF_cmd(ctx, "Protocol", "-SSLv3"); -SSLv2 is B disabled and attempt to override this by the user are +then SSLv3 is B disabled and attempt to override this by the user are ignored. By checking the return code of SSL_CTX_cmd() it is possible to query if a @@ -372,7 +373,7 @@ can be checked instead. If -3 is returned a required argument is missing and an error is indicated. If 0 is returned some other error occurred and this can be reported back to the user. -The function SSL_CONF_cmd_value_type() can be used by applications to +The function SSL_CONF_cmd_value_type() can be used by applications to check for the existence of a command or to perform additional syntax checking or translation of the command value. 
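Rough sketch (not from the upstream change) of the ordering rule described above: applying the application's own "Protocol", "-SSLv3" command after any user-supplied command makes the SSLv3 ban final. userparam and uservalue are hypothetical inputs; error handling is abbreviated.

    /* Sketch only: user choice first, then pin "-SSLv3" so it cannot be
     * re-enabled. */
    #include <openssl/ssl.h>

    int configure_ctx(SSL_CTX *ctx, const char *userparam, const char *uservalue)
    {
        SSL_CONF_CTX *cctx = SSL_CONF_CTX_new();
        int ok = 0;

        if (cctx == NULL)
            return 0;
        SSL_CONF_CTX_set_flags(cctx, SSL_CONF_FLAG_SERVER | SSL_CONF_FLAG_FILE);
        SSL_CONF_CTX_set_ssl_ctx(cctx, ctx);

        if (SSL_CONF_cmd(cctx, userparam, uservalue) > 0   /* user choice   */
            && SSL_CONF_cmd(cctx, "Protocol", "-SSLv3") > 0 /* ours is final */
            && SSL_CONF_CTX_finish(cctx))
            ok = 1;
        SSL_CONF_CTX_free(cctx);
        return ok;
    }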
For example if the return value is B an application could translate a relative diff --git a/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod b/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod index 491ac8c172cb5e..b8cc8797845149 100644 --- a/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod +++ b/deps/openssl/openssl/doc/ssl/SSL_CTX_new.pod @@ -2,13 +2,55 @@ =head1 NAME -SSL_CTX_new - create a new SSL_CTX object as framework for TLS/SSL enabled functions +SSL_CTX_new, +SSLv23_method, SSLv23_server_method, SSLv23_client_method, +TLSv1_2_method, TLSv1_2_server_method, TLSv1_2_client_method, +TLSv1_1_method, TLSv1_1_server_method, TLSv1_1_client_method, +TLSv1_method, TLSv1_server_method, TLSv1_client_method, +SSLv3_method, SSLv3_server_method, SSLv3_client_method, +SSLv2_method, SSLv2_server_method, SSLv2_client_method, +DTLS_method, DTLS_server_method, DTLS_client_method, +DTLSv1_2_method, DTLSv1_2_server_method, DTLSv1_2_client_method, +DTLSv1_method, DTLSv1_server_method, DTLSv1_client_method - +create a new SSL_CTX object as framework for TLS/SSL enabled functions =head1 SYNOPSIS #include SSL_CTX *SSL_CTX_new(const SSL_METHOD *method); + const SSL_METHOD *SSLv23_method(void); + const SSL_METHOD *SSLv23_server_method(void); + const SSL_METHOD *SSLv23_client_method(void); + const SSL_METHOD *TLSv1_2_method(void); + const SSL_METHOD *TLSv1_2_server_method(void); + const SSL_METHOD *TLSv1_2_client_method(void); + const SSL_METHOD *TLSv1_1_method(void); + const SSL_METHOD *TLSv1_1_server_method(void); + const SSL_METHOD *TLSv1_1_client_method(void); + const SSL_METHOD *TLSv1_method(void); + const SSL_METHOD *TLSv1_server_method(void); + const SSL_METHOD *TLSv1_client_method(void); + #ifndef OPENSSL_NO_SSL3_METHOD + const SSL_METHOD *SSLv3_method(void); + const SSL_METHOD *SSLv3_server_method(void); + const SSL_METHOD *SSLv3_client_method(void); + #endif + #ifndef OPENSSL_NO_SSL2 + const SSL_METHOD *SSLv2_method(void); + const SSL_METHOD *SSLv2_server_method(void); + const SSL_METHOD *SSLv2_client_method(void); + #endif + + const SSL_METHOD *DTLS_method(void); + const SSL_METHOD *DTLS_server_method(void); + const SSL_METHOD *DTLS_client_method(void); + const SSL_METHOD *DTLSv1_2_method(void); + const SSL_METHOD *DTLSv1_2_server_method(void); + const SSL_METHOD *DTLSv1_2_client_method(void); + const SSL_METHOD *DTLSv1_method(void); + const SSL_METHOD *DTLSv1_server_method(void); + const SSL_METHOD *DTLSv1_client_method(void); =head1 DESCRIPTION @@ -23,65 +65,88 @@ client only type. B can be of the following types: =over 4 -=item SSLv2_method(void), SSLv2_server_method(void), SSLv2_client_method(void) +=item SSLv23_method(), SSLv23_server_method(), SSLv23_client_method() + +These are the general-purpose I SSL/TLS methods. +The actual protocol version used will be negotiated to the highest version +mutually supported by the client and the server. +The supported protocols are SSLv2, SSLv3, TLSv1, TLSv1.1 and TLSv1.2. +Most applications should use these method, and avoid the version specific +methods described below. + +The list of protocols available can be further limited using the +B, B, B, +B and B options of the +L or L functions. +Clients should avoid creating "holes" in the set of protocols they support, +when disabling a protocol, make sure that you also disable either all previous +or all subsequent protocol versions. +In clients, when a protocol version is disabled without disabling I +previous protocol versions, the effect is to also disable all subsequent +protocol versions. 
+ +The SSLv2 and SSLv3 protocols are deprecated and should generally not be used. +Applications should typically use L in combination with +the B flag to disable negotiation of SSLv3 via the above +I SSL/TLS methods. +The B option is set by default, and would need to be cleared +via L in order to enable negotiation of SSLv2. + +=item TLSv1_2_method(), TLSv1_2_server_method(), TLSv1_2_client_method() -A TLS/SSL connection established with these methods will only understand -the SSLv2 protocol. A client will send out SSLv2 client hello messages -and will also indicate that it only understand SSLv2. A server will only -understand SSLv2 client hello messages. +A TLS/SSL connection established with these methods will only understand the +TLSv1.2 protocol. A client will send out TLSv1.2 client hello messages and +will also indicate that it only understand TLSv1.2. A server will only +understand TLSv1.2 client hello messages. -=item SSLv3_method(void), SSLv3_server_method(void), SSLv3_client_method(void) +=item TLSv1_1_method(), TLSv1_1_server_method(), TLSv1_1_client_method() A TLS/SSL connection established with these methods will only understand the -SSLv3 protocol. A client will send out SSLv3 client hello messages -and will indicate that it only understands SSLv3. A server will only understand -SSLv3 client hello messages. This especially means, that it will -not understand SSLv2 client hello messages which are widely used for -compatibility reasons, see SSLv23_*_method(). +TLSv1.1 protocol. A client will send out TLSv1.1 client hello messages and +will also indicate that it only understand TLSv1.1. A server will only +understand TLSv1.1 client hello messages. -=item TLSv1_method(void), TLSv1_server_method(void), TLSv1_client_method(void) +=item TLSv1_method(), TLSv1_server_method(), TLSv1_client_method() A TLS/SSL connection established with these methods will only understand the -TLSv1 protocol. A client will send out TLSv1 client hello messages -and will indicate that it only understands TLSv1. A server will only understand -TLSv1 client hello messages. This especially means, that it will -not understand SSLv2 client hello messages which are widely used for -compatibility reasons, see SSLv23_*_method(). It will also not understand -SSLv3 client hello messages. - -=item SSLv23_method(void), SSLv23_server_method(void), SSLv23_client_method(void) - -A TLS/SSL connection established with these methods may understand the SSLv2, -SSLv3, TLSv1, TLSv1.1 and TLSv1.2 protocols. - -If the cipher list does not contain any SSLv2 ciphersuites (the default -cipher list does not) or extensions are required (for example server name) -a client will send out TLSv1 client hello messages including extensions and -will indicate that it also understands TLSv1.1, TLSv1.2 and permits a -fallback to SSLv3. A server will support SSLv3, TLSv1, TLSv1.1 and TLSv1.2 -protocols. This is the best choice when compatibility is a concern. - -If any SSLv2 ciphersuites are included in the cipher list and no extensions -are required then SSLv2 compatible client hellos will be used by clients and -SSLv2 will be accepted by servers. This is B recommended due to the -insecurity of SSLv2 and the limited nature of the SSLv2 client hello -prohibiting the use of extensions. +TLSv1 protocol. A client will send out TLSv1 client hello messages and will +indicate that it only understands TLSv1. A server will only understand TLSv1 +client hello messages. 
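A minimal sketch (not from the upstream change) of the pattern recommended above: start from the version-flexible SSLv23 client method and prune protocol versions with SSL_CTX_set_options(); as of 1.0.2g, SSL_OP_NO_SSLv2 is already set by default.

    /* Sketch only: version-flexible client context restricted to TLSv1.0 and
     * newer. Error handling is abbreviated. */
    #include <openssl/ssl.h>

    SSL_CTX *make_client_ctx(void)
    {
        SSL_CTX *ctx;

        SSL_library_init();
        SSL_load_error_strings();

        ctx = SSL_CTX_new(SSLv23_client_method()); /* negotiates highest version */
        if (ctx == NULL)
            return NULL;
        /* Disable SSLv2 and SSLv3; the TLS versions remain enabled. */
        SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3);
        return ctx;
    }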
-=back +=item SSLv3_method(), SSLv3_server_method(), SSLv3_client_method() + +A TLS/SSL connection established with these methods will only understand the +SSLv3 protocol. A client will send out SSLv3 client hello messages and will +indicate that it only understands SSLv3. A server will only understand SSLv3 +client hello messages. The SSLv3 protocol is deprecated and should not be +used. + +=item SSLv2_method(), SSLv2_server_method(), SSLv2_client_method() + +A TLS/SSL connection established with these methods will only understand the +SSLv2 protocol. A client will send out SSLv2 client hello messages and will +also indicate that it only understand SSLv2. A server will only understand +SSLv2 client hello messages. The SSLv2 protocol offers little to no security +and should not be used. +As of OpenSSL 1.0.2g, EXPORT ciphers and 56-bit DES are no longer available +with SSLv2. -The list of protocols available can later be limited using the SSL_OP_NO_SSLv2, -SSL_OP_NO_SSLv3, SSL_OP_NO_TLSv1, SSL_OP_NO_TLSv1_1 and SSL_OP_NO_TLSv1_2 -options of the SSL_CTX_set_options() or SSL_set_options() functions. -Using these options it is possible to choose e.g. SSLv23_server_method() and -be able to negotiate with all possible clients, but to only allow newer -protocols like TLSv1, TLSv1.1 or TLS v1.2. +=item DTLS_method(), DTLS_server_method(), DTLS_client_method() -Applications which never want to support SSLv2 (even is the cipher string -is configured to use SSLv2 ciphersuites) can set SSL_OP_NO_SSLv2. +These are the version-flexible DTLS methods. + +=item DTLSv1_2_method(), DTLSv1_2_server_method(), DTLSv1_2_client_method() + +These are the version-specific methods for DTLSv1.2. + +=item DTLSv1_method(), DTLSv1_server_method(), DTLSv1_client_method() + +These are the version-specific methods for DTLSv1. + +=back -SSL_CTX_new() initializes the list of ciphers, the session cache setting, -the callbacks, the keys and certificates and the options to its default -values. +SSL_CTX_new() initializes the list of ciphers, the session cache setting, the +callbacks, the keys and certificates and the options to its default values. =head1 RETURN VALUES @@ -91,8 +156,8 @@ The following return values can occur: =item NULL -The creation of a new SSL_CTX object failed. Check the error stack to -find out the reason. +The creation of a new SSL_CTX object failed. Check the error stack to find out +the reason. =item Pointer to an SSL_CTX object @@ -102,6 +167,7 @@ The return value points to an allocated SSL_CTX object. =head1 SEE ALSO +L, L, L, L, L, L, L diff --git a/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod b/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod index e80a72cd4d06b1..9a7e98c1d4146d 100644 --- a/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod +++ b/deps/openssl/openssl/doc/ssl/SSL_CTX_set_options.pod @@ -189,15 +189,25 @@ browser has a cert, it will crash/hang. Works for 3.x and 4.xbeta =item SSL_OP_NO_SSLv2 Do not use the SSLv2 protocol. +As of OpenSSL 1.0.2g the B option is set by default. =item SSL_OP_NO_SSLv3 Do not use the SSLv3 protocol. +It is recommended that applications should set this option. =item SSL_OP_NO_TLSv1 Do not use the TLSv1 protocol. +=item SSL_OP_NO_TLSv1_1 + +Do not use the TLSv1.1 protocol. + +=item SSL_OP_NO_TLSv1_2 + +Do not use the TLSv1.2 protocol. 
+ =item SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION When performing renegotiation as a server, always start a new session diff --git a/deps/openssl/openssl/doc/ssl/ssl.pod b/deps/openssl/openssl/doc/ssl/ssl.pod index 242087e691e35a..70cca178a20477 100644 --- a/deps/openssl/openssl/doc/ssl/ssl.pod +++ b/deps/openssl/openssl/doc/ssl/ssl.pod @@ -130,41 +130,86 @@ protocol methods defined in B structures. =over 4 -=item const SSL_METHOD *B(void); +=item const SSL_METHOD *B(void); -Constructor for the SSLv2 SSL_METHOD structure for a dedicated client. +Constructor for the I SSL_METHOD structure for +clients, servers or both. +See L for details. -=item const SSL_METHOD *B(void); +=item const SSL_METHOD *B(void); -Constructor for the SSLv2 SSL_METHOD structure for a dedicated server. +Constructor for the I SSL_METHOD structure for +clients. -=item const SSL_METHOD *B(void); +=item const SSL_METHOD *B(void); -Constructor for the SSLv2 SSL_METHOD structure for combined client and server. +Constructor for the I SSL_METHOD structure for +servers. -=item const SSL_METHOD *B(void); +=item const SSL_METHOD *B(void); -Constructor for the SSLv3 SSL_METHOD structure for a dedicated client. +Constructor for the TLSv1.2 SSL_METHOD structure for clients, servers +or both. -=item const SSL_METHOD *B(void); +=item const SSL_METHOD *B(void); -Constructor for the SSLv3 SSL_METHOD structure for a dedicated server. +Constructor for the TLSv1.2 SSL_METHOD structure for clients. -=item const SSL_METHOD *B(void); +=item const SSL_METHOD *B(void); + +Constructor for the TLSv1.2 SSL_METHOD structure for servers. + +=item const SSL_METHOD *B(void); -Constructor for the SSLv3 SSL_METHOD structure for combined client and server. +Constructor for the TLSv1.1 SSL_METHOD structure for clients, servers +or both. + +=item const SSL_METHOD *B(void); + +Constructor for the TLSv1.1 SSL_METHOD structure for clients. + +=item const SSL_METHOD *B(void); + +Constructor for the TLSv1.1 SSL_METHOD structure for servers. + +=item const SSL_METHOD *B(void); + +Constructor for the TLSv1 SSL_METHOD structure for clients, servers +or both. =item const SSL_METHOD *B(void); -Constructor for the TLSv1 SSL_METHOD structure for a dedicated client. +Constructor for the TLSv1 SSL_METHOD structure for clients. =item const SSL_METHOD *B(void); -Constructor for the TLSv1 SSL_METHOD structure for a dedicated server. +Constructor for the TLSv1 SSL_METHOD structure for servers. -=item const SSL_METHOD *B(void); +=item const SSL_METHOD *B(void); + +Constructor for the SSLv3 SSL_METHOD structure for clients, servers +or both. + +=item const SSL_METHOD *B(void); + +Constructor for the SSLv3 SSL_METHOD structure for clients. + +=item const SSL_METHOD *B(void); + +Constructor for the SSLv3 SSL_METHOD structure for servers. + +=item const SSL_METHOD *B(void); + +Constructor for the SSLv2 SSL_METHOD structure for clients, servers +or both. + +=item const SSL_METHOD *B(void); + +Constructor for the SSLv2 SSL_METHOD structure for clients. + +=item const SSL_METHOD *B(void); -Constructor for the TLSv1 SSL_METHOD structure for combined client and server. +Constructor for the SSLv2 SSL_METHOD structure for servers. 
=back diff --git a/deps/openssl/openssl/engines/e_capi.c b/deps/openssl/openssl/engines/e_capi.c index f4cd2ffe7fa16d..6e524633f3f0f6 100644 --- a/deps/openssl/openssl/engines/e_capi.c +++ b/deps/openssl/openssl/engines/e_capi.c @@ -114,6 +114,26 @@ # define CERT_SYSTEM_STORE_CURRENT_USER 0x00010000 # endif +# ifndef ALG_SID_SHA_256 +# define ALG_SID_SHA_256 12 +# endif +# ifndef ALG_SID_SHA_384 +# define ALG_SID_SHA_384 13 +# endif +# ifndef ALG_SID_SHA_512 +# define ALG_SID_SHA_512 14 +# endif + +# ifndef CALG_SHA_256 +# define CALG_SHA_256 (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_256) +# endif +# ifndef CALG_SHA_384 +# define CALG_SHA_384 (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_384) +# endif +# ifndef CALG_SHA_512 +# define CALG_SHA_512 (ALG_CLASS_HASH | ALG_TYPE_ANY | ALG_SID_SHA_512) +# endif + # include # include # include @@ -800,6 +820,18 @@ int capi_rsa_sign(int dtype, const unsigned char *m, unsigned int m_len, } /* Convert the signature type to a CryptoAPI algorithm ID */ switch (dtype) { + case NID_sha256: + alg = CALG_SHA_256; + break; + + case NID_sha384: + alg = CALG_SHA_384; + break; + + case NID_sha512: + alg = CALG_SHA_512; + break; + case NID_sha1: alg = CALG_SHA1; break; diff --git a/deps/openssl/openssl/include/openssl/bio.h b/deps/openssl/openssl/include/openssl/bio.h index 6e2293bc66daad..6790aed28e0bae 100644 --- a/deps/openssl/openssl/include/openssl/bio.h +++ b/deps/openssl/openssl/include/openssl/bio.h @@ -479,7 +479,7 @@ struct bio_dgram_sctp_prinfo { # define BIO_get_conn_hostname(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,0) # define BIO_get_conn_port(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,1) # define BIO_get_conn_ip(b) BIO_ptr_ctrl(b,BIO_C_GET_CONNECT,2) -# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,0,NULL) +# define BIO_get_conn_int_port(b) BIO_ctrl(b,BIO_C_GET_CONNECT,3,NULL) # define BIO_set_nbio(b,n) BIO_ctrl(b,BIO_C_SET_NBIO,(n),NULL) @@ -689,7 +689,7 @@ long BIO_debug_callback(BIO *bio, int cmd, const char *argp, int argi, long argl, long ret); BIO_METHOD *BIO_s_mem(void); -BIO *BIO_new_mem_buf(void *buf, int len); +BIO *BIO_new_mem_buf(const void *buf, int len); BIO_METHOD *BIO_s_socket(void); BIO_METHOD *BIO_s_connect(void); BIO_METHOD *BIO_s_accept(void); diff --git a/deps/openssl/openssl/include/openssl/bn.h b/deps/openssl/openssl/include/openssl/bn.h index 5696965e9a09d0..86264ae6315fb1 100644 --- a/deps/openssl/openssl/include/openssl/bn.h +++ b/deps/openssl/openssl/include/openssl/bn.h @@ -125,6 +125,7 @@ #ifndef HEADER_BN_H # define HEADER_BN_H +# include # include # ifndef OPENSSL_NO_FP_API # include /* FILE */ @@ -721,8 +722,17 @@ const BIGNUM *BN_get0_nist_prime_521(void); /* library internal functions */ -# define bn_expand(a,bits) ((((((bits+BN_BITS2-1))/BN_BITS2)) <= (a)->dmax)?\ - (a):bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2)) +# define bn_expand(a,bits) \ + ( \ + bits > (INT_MAX - BN_BITS2 + 1) ? \ + NULL \ + : \ + (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax) ? 
\ + (a) \ + : \ + bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2) \ + ) + # define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words))) BIGNUM *bn_expand2(BIGNUM *a, int words); # ifndef OPENSSL_NO_DEPRECATED diff --git a/deps/openssl/openssl/include/openssl/crypto.h b/deps/openssl/openssl/include/openssl/crypto.h index c450d7a3c374b3..6c644ce12a8250 100644 --- a/deps/openssl/openssl/include/openssl/crypto.h +++ b/deps/openssl/openssl/include/openssl/crypto.h @@ -628,7 +628,7 @@ void OPENSSL_init(void); * into a defined order as the return value when a != b is undefined, other * than to be non-zero. */ -int CRYPTO_memcmp(const void *a, const void *b, size_t len); +int CRYPTO_memcmp(const volatile void *a, const volatile void *b, size_t len); /* BEGIN ERROR CODES */ /* diff --git a/deps/openssl/openssl/include/openssl/dh.h b/deps/openssl/openssl/include/openssl/dh.h index 5498a9dc106044..a5bd9016aae85a 100644 --- a/deps/openssl/openssl/include/openssl/dh.h +++ b/deps/openssl/openssl/include/openssl/dh.h @@ -174,7 +174,7 @@ struct dh_st { /* DH_check_pub_key error codes */ # define DH_CHECK_PUBKEY_TOO_SMALL 0x01 # define DH_CHECK_PUBKEY_TOO_LARGE 0x02 -# define DH_CHECK_PUBKEY_INVALID 0x03 +# define DH_CHECK_PUBKEY_INVALID 0x04 /* * primes p where (p-1)/2 is prime too are called "safe"; we define this for diff --git a/deps/openssl/openssl/include/openssl/opensslv.h b/deps/openssl/openssl/include/openssl/opensslv.h index 03b8c4843784a7..4334fd15cd8739 100644 --- a/deps/openssl/openssl/include/openssl/opensslv.h +++ b/deps/openssl/openssl/include/openssl/opensslv.h @@ -30,11 +30,11 @@ extern "C" { * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for * major minor fix final patch/beta) */ -# define OPENSSL_VERSION_NUMBER 0x1000206fL +# define OPENSSL_VERSION_NUMBER 0x1000207fL # ifdef OPENSSL_FIPS -# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f-fips 28 Jan 2016" +# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g-fips 1 Mar 2016" # else -# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2f 28 Jan 2016" +# define OPENSSL_VERSION_TEXT "OpenSSL 1.0.2g 1 Mar 2016" # endif # define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT diff --git a/deps/openssl/openssl/include/openssl/srp.h b/deps/openssl/openssl/include/openssl/srp.h index d072536fec9bb3..028892a1ff5e04 100644 --- a/deps/openssl/openssl/include/openssl/srp.h +++ b/deps/openssl/openssl/include/openssl/srp.h @@ -82,16 +82,21 @@ typedef struct SRP_gN_cache_st { DECLARE_STACK_OF(SRP_gN_cache) typedef struct SRP_user_pwd_st { + /* Owned by us. */ char *id; BIGNUM *s; BIGNUM *v; + /* Not owned by us. */ const BIGNUM *g; const BIGNUM *N; + /* Owned by us. */ char *info; } SRP_user_pwd; DECLARE_STACK_OF(SRP_user_pwd) +void SRP_user_pwd_free(SRP_user_pwd *user_pwd); + typedef struct SRP_VBASE_st { STACK_OF(SRP_user_pwd) *users_pwd; STACK_OF(SRP_gN_cache) *gN_cache; @@ -115,7 +120,12 @@ DECLARE_STACK_OF(SRP_gN) SRP_VBASE *SRP_VBASE_new(char *seed_key); int SRP_VBASE_free(SRP_VBASE *vb); int SRP_VBASE_init(SRP_VBASE *vb, char *verifier_file); + +/* This method ignores the configured seed and fails for an unknown user. 
*/ SRP_user_pwd *SRP_VBASE_get_by_user(SRP_VBASE *vb, char *username); +/* NOTE: unlike in SRP_VBASE_get_by_user, caller owns the returned pointer.*/ +SRP_user_pwd *SRP_VBASE_get1_by_user(SRP_VBASE *vb, char *username); + char *SRP_create_verifier(const char *user, const char *pass, char **salt, char **verifier, const char *N, const char *g); int SRP_create_verifier_BN(const char *user, const char *pass, BIGNUM **salt, diff --git a/deps/openssl/openssl/include/openssl/ssl.h b/deps/openssl/openssl/include/openssl/ssl.h index ae8c92575e03e0..04d4007eeb8e5a 100644 --- a/deps/openssl/openssl/include/openssl/ssl.h +++ b/deps/openssl/openssl/include/openssl/ssl.h @@ -2713,7 +2713,6 @@ void ERR_load_SSL_strings(void); # define SSL_F_SSL3_SETUP_KEY_BLOCK 157 # define SSL_F_SSL3_SETUP_READ_BUFFER 156 # define SSL_F_SSL3_SETUP_WRITE_BUFFER 291 -# define SSL_F_SSL3_SHUTDOWN 396 # define SSL_F_SSL3_WRITE_BYTES 158 # define SSL_F_SSL3_WRITE_PENDING 159 # define SSL_F_SSL_ADD_CERT_CHAIN 318 diff --git a/deps/openssl/openssl/ms/uplink-x86.pl b/deps/openssl/openssl/ms/uplink-x86.pl index 0dffc14fcd2ead..53b998d2708a9b 100755 --- a/deps/openssl/openssl/ms/uplink-x86.pl +++ b/deps/openssl/openssl/ms/uplink-x86.pl @@ -14,11 +14,11 @@ for ($i=1;$i<=$N;$i++) { &function_begin_B("_\$lazy${i}"); &lea ("eax",&DWP(&label("OPENSSL_UplinkTable"))); - &push ("eax"); &push ($i); + &push ("eax"); &call (&label("OPENSSL_Uplink")); - &add ("esp",8); &pop ("eax"); + &add ("esp",4); &jmp_ptr(&DWP(4*$i,"eax")); &function_end_B("_\$lazy${i}"); } diff --git a/deps/openssl/openssl/openssl.spec b/deps/openssl/openssl/openssl.spec index 72ace12c4440e3..67fb0735e2bf76 100644 --- a/deps/openssl/openssl/openssl.spec +++ b/deps/openssl/openssl/openssl.spec @@ -6,7 +6,7 @@ Release: 1 Summary: Secure Sockets Layer and cryptography libraries and tools Name: openssl -Version: 1.0.2f +Version: 1.0.2g Source0: ftp://ftp.openssl.org/source/%{name}-%{version}.tar.gz License: OpenSSL Group: System Environment/Libraries diff --git a/deps/openssl/openssl/ssl/Makefile b/deps/openssl/openssl/ssl/Makefile index 7b90fb037550af..b6dee5b5ea522a 100644 --- a/deps/openssl/openssl/ssl/Makefile +++ b/deps/openssl/openssl/ssl/Makefile @@ -15,7 +15,7 @@ KRB5_INCLUDES= CFLAGS= $(INCLUDES) $(CFLAG) GENERAL=Makefile README ssl-lib.com install.com -TEST=ssltest.c heartbeat_test.c clienthellotest.c +TEST=ssltest.c heartbeat_test.c clienthellotest.c sslv2conftest.c APPS= LIB=$(TOP)/libssl.a @@ -399,14 +399,14 @@ s2_clnt.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h s2_clnt.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h s2_clnt.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h s2_clnt.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h -s2_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rand.h -s2_clnt.o: ../include/openssl/rsa.h ../include/openssl/safestack.h -s2_clnt.o: ../include/openssl/sha.h ../include/openssl/srtp.h -s2_clnt.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h -s2_clnt.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h -s2_clnt.o: ../include/openssl/stack.h ../include/openssl/symhacks.h -s2_clnt.o: ../include/openssl/tls1.h ../include/openssl/x509.h -s2_clnt.o: ../include/openssl/x509_vfy.h s2_clnt.c ssl_locl.h +s2_clnt.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h +s2_clnt.o: ../include/openssl/safestack.h ../include/openssl/sha.h +s2_clnt.o: ../include/openssl/srtp.h ../include/openssl/ssl.h +s2_clnt.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h +s2_clnt.o: 
../include/openssl/ssl3.h ../include/openssl/stack.h +s2_clnt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h +s2_clnt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_clnt.c +s2_clnt.o: ssl_locl.h s2_enc.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h s2_enc.o: ../include/openssl/buffer.h ../include/openssl/comp.h s2_enc.o: ../include/openssl/crypto.h ../include/openssl/dsa.h @@ -435,18 +435,18 @@ s2_lib.o: ../include/openssl/ec.h ../include/openssl/ecdh.h s2_lib.o: ../include/openssl/ecdsa.h ../include/openssl/err.h s2_lib.o: ../include/openssl/evp.h ../include/openssl/hmac.h s2_lib.o: ../include/openssl/kssl.h ../include/openssl/lhash.h -s2_lib.o: ../include/openssl/md5.h ../include/openssl/obj_mac.h -s2_lib.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h -s2_lib.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h -s2_lib.o: ../include/openssl/pem.h ../include/openssl/pem2.h -s2_lib.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h -s2_lib.o: ../include/openssl/rsa.h ../include/openssl/safestack.h -s2_lib.o: ../include/openssl/sha.h ../include/openssl/srtp.h -s2_lib.o: ../include/openssl/ssl.h ../include/openssl/ssl2.h -s2_lib.o: ../include/openssl/ssl23.h ../include/openssl/ssl3.h -s2_lib.o: ../include/openssl/stack.h ../include/openssl/symhacks.h -s2_lib.o: ../include/openssl/tls1.h ../include/openssl/x509.h -s2_lib.o: ../include/openssl/x509_vfy.h s2_lib.c ssl_locl.h +s2_lib.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h +s2_lib.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h +s2_lib.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h +s2_lib.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h +s2_lib.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h +s2_lib.o: ../include/openssl/safestack.h ../include/openssl/sha.h +s2_lib.o: ../include/openssl/srtp.h ../include/openssl/ssl.h +s2_lib.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h +s2_lib.o: ../include/openssl/ssl3.h ../include/openssl/stack.h +s2_lib.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h +s2_lib.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_lib.c +s2_lib.o: ssl_locl.h s2_meth.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h s2_meth.o: ../include/openssl/buffer.h ../include/openssl/comp.h s2_meth.o: ../include/openssl/crypto.h ../include/openssl/dsa.h @@ -487,20 +487,19 @@ s2_pkt.o: ../include/openssl/ssl3.h ../include/openssl/stack.h s2_pkt.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h s2_pkt.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h s2_pkt.c s2_pkt.o: ssl_locl.h -s2_srvr.o: ../crypto/constant_time_locl.h ../e_os.h ../include/openssl/asn1.h -s2_srvr.o: ../include/openssl/bio.h ../include/openssl/buffer.h -s2_srvr.o: ../include/openssl/comp.h ../include/openssl/crypto.h -s2_srvr.o: ../include/openssl/dsa.h ../include/openssl/dtls1.h -s2_srvr.o: ../include/openssl/e_os2.h ../include/openssl/ec.h -s2_srvr.o: ../include/openssl/ecdh.h ../include/openssl/ecdsa.h -s2_srvr.o: ../include/openssl/err.h ../include/openssl/evp.h -s2_srvr.o: ../include/openssl/hmac.h ../include/openssl/kssl.h -s2_srvr.o: ../include/openssl/lhash.h ../include/openssl/obj_mac.h -s2_srvr.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h -s2_srvr.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h -s2_srvr.o: ../include/openssl/pem.h ../include/openssl/pem2.h -s2_srvr.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h 
-s2_srvr.o: ../include/openssl/rand.h ../include/openssl/rsa.h +s2_srvr.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h +s2_srvr.o: ../include/openssl/buffer.h ../include/openssl/comp.h +s2_srvr.o: ../include/openssl/crypto.h ../include/openssl/dsa.h +s2_srvr.o: ../include/openssl/dtls1.h ../include/openssl/e_os2.h +s2_srvr.o: ../include/openssl/ec.h ../include/openssl/ecdh.h +s2_srvr.o: ../include/openssl/ecdsa.h ../include/openssl/err.h +s2_srvr.o: ../include/openssl/evp.h ../include/openssl/hmac.h +s2_srvr.o: ../include/openssl/kssl.h ../include/openssl/lhash.h +s2_srvr.o: ../include/openssl/obj_mac.h ../include/openssl/objects.h +s2_srvr.o: ../include/openssl/opensslconf.h ../include/openssl/opensslv.h +s2_srvr.o: ../include/openssl/ossl_typ.h ../include/openssl/pem.h +s2_srvr.o: ../include/openssl/pem2.h ../include/openssl/pkcs7.h +s2_srvr.o: ../include/openssl/pqueue.h ../include/openssl/rsa.h s2_srvr.o: ../include/openssl/safestack.h ../include/openssl/sha.h s2_srvr.o: ../include/openssl/srtp.h ../include/openssl/ssl.h s2_srvr.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h diff --git a/deps/openssl/openssl/ssl/s2_lib.c b/deps/openssl/openssl/ssl/s2_lib.c index d55b93f76bb7f8..a8036b357f0e93 100644 --- a/deps/openssl/openssl/ssl/s2_lib.c +++ b/deps/openssl/openssl/ssl/s2_lib.c @@ -156,6 +156,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = { 128, }, +# if 0 /* RC4_128_EXPORT40_WITH_MD5 */ { 1, @@ -171,6 +172,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = { 40, 128, }, +# endif /* RC2_128_CBC_WITH_MD5 */ { @@ -188,6 +190,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = { 128, }, +# if 0 /* RC2_128_CBC_EXPORT40_WITH_MD5 */ { 1, @@ -203,6 +206,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = { 40, 128, }, +# endif # ifndef OPENSSL_NO_IDEA /* IDEA_128_CBC_WITH_MD5 */ @@ -222,6 +226,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = { }, # endif +# if 0 /* DES_64_CBC_WITH_MD5 */ { 1, @@ -237,6 +242,7 @@ OPENSSL_GLOBAL const SSL_CIPHER ssl2_ciphers[] = { 56, 56, }, +# endif /* DES_192_EDE3_CBC_WITH_MD5 */ { diff --git a/deps/openssl/openssl/ssl/s3_lib.c b/deps/openssl/openssl/ssl/s3_lib.c index f846cb5b7b0199..4aac3b27928045 100644 --- a/deps/openssl/openssl/ssl/s3_lib.c +++ b/deps/openssl/openssl/ssl/s3_lib.c @@ -198,6 +198,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 03 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_RSA_RC4_40_MD5, @@ -212,6 +213,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +#endif /* Cipher 04 */ { @@ -246,6 +248,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 06 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_RSA_RC2_40_MD5, @@ -260,6 +263,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +#endif /* Cipher 07 */ #ifndef OPENSSL_NO_IDEA @@ -280,6 +284,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { #endif /* Cipher 08 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_RSA_DES_40_CBC_SHA, @@ -294,8 +299,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 56, }, +#endif /* Cipher 09 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_RSA_DES_64_CBC_SHA, @@ -310,6 +317,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +#endif /* Cipher 0A */ { @@ -329,6 +337,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { /* The DH ciphers */ /* Cipher 0B */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 0, SSL3_TXT_DH_DSS_DES_40_CBC_SHA, @@ -343,8 +352,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 56, }, +#endif /* Cipher 0C */ +#ifndef 
OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_DH_DSS_DES_64_CBC_SHA, @@ -359,6 +370,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +#endif /* Cipher 0D */ { @@ -377,6 +389,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 0E */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 0, SSL3_TXT_DH_RSA_DES_40_CBC_SHA, @@ -391,8 +404,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 56, }, +#endif /* Cipher 0F */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_DH_RSA_DES_64_CBC_SHA, @@ -407,6 +422,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +#endif /* Cipher 10 */ { @@ -426,6 +442,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { /* The Ephemeral DH ciphers */ /* Cipher 11 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_EDH_DSS_DES_40_CBC_SHA, @@ -440,8 +457,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 56, }, +#endif /* Cipher 12 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_EDH_DSS_DES_64_CBC_SHA, @@ -456,6 +475,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +#endif /* Cipher 13 */ { @@ -474,6 +494,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 14 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_EDH_RSA_DES_40_CBC_SHA, @@ -488,8 +509,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 56, }, +#endif /* Cipher 15 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_EDH_RSA_DES_64_CBC_SHA, @@ -504,6 +527,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +#endif /* Cipher 16 */ { @@ -522,6 +546,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 17 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_ADH_RC4_40_MD5, @@ -536,6 +561,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +#endif /* Cipher 18 */ { @@ -554,6 +580,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 19 */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_ADH_DES_40_CBC_SHA, @@ -568,8 +595,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +#endif /* Cipher 1A */ +#ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_ADH_DES_64_CBC_SHA, @@ -584,6 +613,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +#endif /* Cipher 1B */ { @@ -655,6 +685,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { #ifndef OPENSSL_NO_KRB5 /* The Kerberos ciphers*/ /* Cipher 1E */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_DES_64_CBC_SHA, @@ -669,6 +700,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +# endif /* Cipher 1F */ { @@ -719,6 +751,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 22 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_DES_64_CBC_MD5, @@ -733,6 +766,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +# endif /* Cipher 23 */ { @@ -783,6 +817,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { }, /* Cipher 26 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_DES_40_CBC_SHA, @@ -797,8 +832,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 56, }, +# endif /* Cipher 27 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_RC2_40_CBC_SHA, @@ -813,8 +850,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +# endif /* Cipher 28 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_RC4_40_SHA, @@ -829,8 +868,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +# endif /* Cipher 29 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_DES_40_CBC_MD5, @@ -845,8 +886,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 56, }, +# endif /* Cipher 2A */ +# ifndef 
OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_RC2_40_CBC_MD5, @@ -861,8 +904,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +# endif /* Cipher 2B */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, SSL3_TXT_KRB5_RC4_40_MD5, @@ -877,6 +922,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 40, 128, }, +# endif #endif /* OPENSSL_NO_KRB5 */ /* New AES ciphersuites */ @@ -1300,6 +1346,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { # endif /* Cipher 62 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, TLS1_TXT_RSA_EXPORT1024_WITH_DES_CBC_SHA, @@ -1314,8 +1361,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +# endif /* Cipher 63 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, TLS1_TXT_DHE_DSS_EXPORT1024_WITH_DES_CBC_SHA, @@ -1330,8 +1379,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 56, }, +# endif /* Cipher 64 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, TLS1_TXT_RSA_EXPORT1024_WITH_RC4_56_SHA, @@ -1346,8 +1397,10 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 128, }, +# endif /* Cipher 65 */ +# ifndef OPENSSL_NO_WEAK_SSL_CIPHERS { 1, TLS1_TXT_DHE_DSS_EXPORT1024_WITH_RC4_56_SHA, @@ -1362,6 +1415,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = { 56, 128, }, +# endif /* Cipher 66 */ { @@ -4326,21 +4380,6 @@ int ssl3_shutdown(SSL *s) } #endif } else if (!(s->shutdown & SSL_RECEIVED_SHUTDOWN)) { - if (SSL_in_init(s)) { - /* - * We can't shutdown properly if we are in the middle of a - * handshake. Doing so is problematic because the peer may send a - * CCS before it acts on our close_notify. However we should not - * continue to process received handshake messages or CCS once our - * close_notify has been sent. Therefore any close_notify from - * the peer will be unreadable because we have not moved to the next - * cipher state. Its best just to avoid this can-of-worms. Return - * an error if we are wanting to wait for a close_notify from the - * peer and we are in init. 
- */ - SSLerr(SSL_F_SSL3_SHUTDOWN, SSL_R_SHUTDOWN_WHILE_IN_INIT); - return -1; - } /* * If we are waiting for a close from our peer, we are closed */ diff --git a/deps/openssl/openssl/ssl/ssl.h b/deps/openssl/openssl/ssl/ssl.h index ae8c92575e03e0..04d4007eeb8e5a 100644 --- a/deps/openssl/openssl/ssl/ssl.h +++ b/deps/openssl/openssl/ssl/ssl.h @@ -2713,7 +2713,6 @@ void ERR_load_SSL_strings(void); # define SSL_F_SSL3_SETUP_KEY_BLOCK 157 # define SSL_F_SSL3_SETUP_READ_BUFFER 156 # define SSL_F_SSL3_SETUP_WRITE_BUFFER 291 -# define SSL_F_SSL3_SHUTDOWN 396 # define SSL_F_SSL3_WRITE_BYTES 158 # define SSL_F_SSL3_WRITE_PENDING 159 # define SSL_F_SSL_ADD_CERT_CHAIN 318 diff --git a/deps/openssl/openssl/ssl/ssl_conf.c b/deps/openssl/openssl/ssl/ssl_conf.c index 5478840deae994..8d3709d2b62c99 100644 --- a/deps/openssl/openssl/ssl/ssl_conf.c +++ b/deps/openssl/openssl/ssl/ssl_conf.c @@ -330,11 +330,19 @@ static int cmd_Protocol(SSL_CONF_CTX *cctx, const char *value) SSL_FLAG_TBL_INV("TLSv1.1", SSL_OP_NO_TLSv1_1), SSL_FLAG_TBL_INV("TLSv1.2", SSL_OP_NO_TLSv1_2) }; + int ret; + int sslv2off; + if (!(cctx->flags & SSL_CONF_FLAG_FILE)) return -2; cctx->tbl = ssl_protocol_list; cctx->ntbl = sizeof(ssl_protocol_list) / sizeof(ssl_flag_tbl); - return CONF_parse_list(value, ',', 1, ssl_set_option_list, cctx); + + sslv2off = *cctx->poptions & SSL_OP_NO_SSLv2; + ret = CONF_parse_list(value, ',', 1, ssl_set_option_list, cctx); + /* Never turn on SSLv2 through configuration */ + *cctx->poptions |= sslv2off; + return ret; } static int cmd_Options(SSL_CONF_CTX *cctx, const char *value) diff --git a/deps/openssl/openssl/ssl/ssl_err.c b/deps/openssl/openssl/ssl/ssl_err.c index dd3b2afd1ea6b7..704088dc469ed9 100644 --- a/deps/openssl/openssl/ssl/ssl_err.c +++ b/deps/openssl/openssl/ssl/ssl_err.c @@ -206,7 +206,6 @@ static ERR_STRING_DATA SSL_str_functs[] = { {ERR_FUNC(SSL_F_SSL3_SETUP_KEY_BLOCK), "ssl3_setup_key_block"}, {ERR_FUNC(SSL_F_SSL3_SETUP_READ_BUFFER), "ssl3_setup_read_buffer"}, {ERR_FUNC(SSL_F_SSL3_SETUP_WRITE_BUFFER), "ssl3_setup_write_buffer"}, - {ERR_FUNC(SSL_F_SSL3_SHUTDOWN), "ssl3_shutdown"}, {ERR_FUNC(SSL_F_SSL3_WRITE_BYTES), "ssl3_write_bytes"}, {ERR_FUNC(SSL_F_SSL3_WRITE_PENDING), "ssl3_write_pending"}, {ERR_FUNC(SSL_F_SSL_ADD_CERT_CHAIN), "ssl_add_cert_chain"}, diff --git a/deps/openssl/openssl/ssl/ssl_lib.c b/deps/openssl/openssl/ssl/ssl_lib.c index 2744be8ad8ce67..f1279bbf910343 100644 --- a/deps/openssl/openssl/ssl/ssl_lib.c +++ b/deps/openssl/openssl/ssl/ssl_lib.c @@ -1060,7 +1060,12 @@ int SSL_shutdown(SSL *s) return -1; } - return s->method->ssl_shutdown(s); + if (!SSL_in_init(s)) { + return s->method->ssl_shutdown(s); + } else { + SSLerr(SSL_F_SSL_SHUTDOWN, SSL_R_SHUTDOWN_WHILE_IN_INIT); + return -1; + } } int SSL_renegotiate(SSL *s) @@ -2049,6 +2054,13 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *meth) */ ret->options |= SSL_OP_LEGACY_SERVER_CONNECT; + /* + * Disable SSLv2 by default, callers that want to enable SSLv2 will have to + * explicitly clear this option via either of SSL_CTX_clear_options() or + * SSL_clear_options(). 
+ */ + ret->options |= SSL_OP_NO_SSLv2; + return (ret); err: SSLerr(SSL_F_SSL_CTX_NEW, ERR_R_MALLOC_FAILURE); diff --git a/deps/openssl/openssl/ssl/sslv2conftest.c b/deps/openssl/openssl/ssl/sslv2conftest.c new file mode 100644 index 00000000000000..1fd748b11866e0 --- /dev/null +++ b/deps/openssl/openssl/ssl/sslv2conftest.c @@ -0,0 +1,231 @@ +/* Written by Matt Caswell for the OpenSSL Project */ +/* ==================================================================== + * Copyright (c) 2016 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <openssl/err.h> +#include <openssl/ssl.h> + + +#define TOTAL_NUM_TESTS 2 +#define TEST_SSL_CTX 0 + +#define SSLV2ON 1 +#define SSLV2OFF 0 + +SSL_CONF_CTX *confctx; +SSL_CTX *ctx; +SSL *ssl; + +static int checksslv2(int test, int sslv2) +{ + int options; + if (test == TEST_SSL_CTX) { + options = SSL_CTX_get_options(ctx); + } else { + options = SSL_get_options(ssl); + } + return ((options & SSL_OP_NO_SSLv2) == 0) ^ (sslv2 == SSLV2OFF); +} + +int main(int argc, char *argv[]) +{ + BIO *err; + int testresult = 0; + int currtest; + + SSL_library_init(); + SSL_load_error_strings(); + + err = BIO_new_fp(stderr, BIO_NOCLOSE | BIO_FP_TEXT); + + CRYPTO_malloc_debug_init(); + CRYPTO_set_mem_debug_options(V_CRYPTO_MDEBUG_ALL); + CRYPTO_mem_ctrl(CRYPTO_MEM_CHECK_ON); + + + confctx = SSL_CONF_CTX_new(); + ctx = SSL_CTX_new(SSLv23_method()); + ssl = SSL_new(ctx); + if (confctx == NULL || ctx == NULL) + goto end; + + SSL_CONF_CTX_set_flags(confctx, SSL_CONF_FLAG_FILE + | SSL_CONF_FLAG_CLIENT + | SSL_CONF_FLAG_SERVER); + + /* + * For each test set up an SSL_CTX and SSL and see whether SSLv2 is enabled + * as expected after various SSL_CONF_cmd("Protocol", ...) calls. + */ + for (currtest = 0; currtest < TOTAL_NUM_TESTS; currtest++) { + BIO_printf(err, "SSLv2 CONF Test number %d\n", currtest); + if (currtest == TEST_SSL_CTX) + SSL_CONF_CTX_set_ssl_ctx(confctx, ctx); + else + SSL_CONF_CTX_set_ssl(confctx, ssl); + + /* SSLv2 should be off by default */ + if (!checksslv2(currtest, SSLV2OFF)) { + BIO_printf(err, "SSLv2 CONF Test: Off by default test FAIL\n"); + goto end; + } + + if (SSL_CONF_cmd(confctx, "Protocol", "ALL") != 2 + || !SSL_CONF_CTX_finish(confctx)) { + BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n"); + goto end; + } + + /* Should still be off even after ALL Protocols on */ + if (!checksslv2(currtest, SSLV2OFF)) { + BIO_printf(err, "SSLv2 CONF Test: Off after config #1 FAIL\n"); + goto end; + } + + if (SSL_CONF_cmd(confctx, "Protocol", "SSLv2") != 2 + || !SSL_CONF_CTX_finish(confctx)) { + BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n"); + goto end; + } + + /* Should still be off even if explicitly asked for */ + if (!checksslv2(currtest, SSLV2OFF)) { + BIO_printf(err, "SSLv2 CONF Test: Off after config #2 FAIL\n"); + goto end; + } + + if (SSL_CONF_cmd(confctx, "Protocol", "-SSLv2") != 2 + || !SSL_CONF_CTX_finish(confctx)) { + BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n");; + goto end; + } + + if (!checksslv2(currtest, SSLV2OFF)) { + BIO_printf(err, "SSLv2 CONF Test: Off after config #3 FAIL\n"); + goto end; + } + + if (currtest == TEST_SSL_CTX) + SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2); + else + SSL_clear_options(ssl, SSL_OP_NO_SSLv2); + + if (!checksslv2(currtest, SSLV2ON)) { + BIO_printf(err, "SSLv2 CONF Test: On after clear FAIL\n"); + goto end; + } + + if (SSL_CONF_cmd(confctx, "Protocol", "ALL") != 2 + || !SSL_CONF_CTX_finish(confctx)) { + BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n"); + goto end; + } + + /* Option has been cleared and config says have SSLv2 so should be on */ + if (!checksslv2(currtest, SSLV2ON)) { + BIO_printf(err, "SSLv2 CONF Test: On after config #1 FAIL\n"); + goto end; + } + + if (SSL_CONF_cmd(confctx, "Protocol", "SSLv2") != 2 + || !SSL_CONF_CTX_finish(confctx)) { + BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n"); + goto end; + } + + /* Option has been cleared and config says have SSLv2 so should be on */ + if (!checksslv2(currtest, SSLV2ON)) { + BIO_printf(err, "SSLv2 CONF Test: On after 
config #2 FAIL\n"); + goto end; + } + + if (SSL_CONF_cmd(confctx, "Protocol", "-SSLv2") != 2 + || !SSL_CONF_CTX_finish(confctx)) { + BIO_printf(err, "SSLv2 CONF Test: SSL_CONF command FAIL\n"); + goto end; + } + + /* Option has been cleared but config says no SSLv2 so should be off */ + if (!checksslv2(currtest, SSLV2OFF)) { + BIO_printf(err, "SSLv2 CONF Test: Off after config #4 FAIL\n"); + goto end; + } + + } + + testresult = 1; + + end: + SSL_free(ssl); + SSL_CTX_free(ctx); + SSL_CONF_CTX_free(confctx); + + if (!testresult) { + printf("SSLv2 CONF test: FAILED (Test %d)\n", currtest); + ERR_print_errors(err); + } else { + printf("SSLv2 CONF test: PASSED\n"); + } + + ERR_free_strings(); + ERR_remove_thread_state(NULL); + EVP_cleanup(); + CRYPTO_cleanup_all_ex_data(); + CRYPTO_mem_leaks(err); + BIO_free(err); + + return testresult ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/deps/openssl/openssl/test/Makefile b/deps/openssl/openssl/test/Makefile index b180971b28e729..e566babfa59d42 100644 --- a/deps/openssl/openssl/test/Makefile +++ b/deps/openssl/openssl/test/Makefile @@ -70,6 +70,7 @@ HEARTBEATTEST= heartbeat_test CONSTTIMETEST= constant_time_test VERIFYEXTRATEST= verify_extra_test CLIENTHELLOTEST= clienthellotest +SSLV2CONFTEST = sslv2conftest TESTS= alltests @@ -83,7 +84,7 @@ EXE= $(BNTEST)$(EXE_EXT) $(ECTEST)$(EXE_EXT) $(ECDSATEST)$(EXE_EXT) $(ECDHTEST) $(EVPTEST)$(EXE_EXT) $(EVPEXTRATEST)$(EXE_EXT) $(IGETEST)$(EXE_EXT) $(JPAKETEST)$(EXE_EXT) $(SRPTEST)$(EXE_EXT) \ $(ASN1TEST)$(EXE_EXT) $(V3NAMETEST)$(EXE_EXT) $(HEARTBEATTEST)$(EXE_EXT) \ $(CONSTTIMETEST)$(EXE_EXT) $(VERIFYEXTRATEST)$(EXE_EXT) \ - $(CLIENTHELLOTEST)$(EXE_EXT) + $(CLIENTHELLOTEST)$(EXE_EXT) $(SSLV2CONFTEST)$(EXE_EXT) # $(METHTEST)$(EXE_EXT) @@ -97,7 +98,7 @@ OBJ= $(BNTEST).o $(ECTEST).o $(ECDSATEST).o $(ECDHTEST).o $(IDEATEST).o \ $(BFTEST).o $(SSLTEST).o $(DSATEST).o $(EXPTEST).o $(RSATEST).o \ $(EVPTEST).o $(EVPEXTRATEST).o $(IGETEST).o $(JPAKETEST).o $(ASN1TEST).o $(V3NAMETEST).o \ $(HEARTBEATTEST).o $(CONSTTIMETEST).o $(VERIFYEXTRATEST).o \ - $(CLIENTHELLOTEST).o + $(CLIENTHELLOTEST).o $(SSLV2CONFTEST).o SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \ $(MD2TEST).c $(MD4TEST).c $(MD5TEST).c \ @@ -108,7 +109,7 @@ SRC= $(BNTEST).c $(ECTEST).c $(ECDSATEST).c $(ECDHTEST).c $(IDEATEST).c \ $(BFTEST).c $(SSLTEST).c $(DSATEST).c $(EXPTEST).c $(RSATEST).c \ $(EVPTEST).c $(EVPEXTRATEST).c $(IGETEST).c $(JPAKETEST).c $(SRPTEST).c $(ASN1TEST).c \ $(V3NAMETEST).c $(HEARTBEATTEST).c $(CONSTTIMETEST).c $(VERIFYEXTRATEST).c \ - $(CLIENTHELLOTEST).c + $(CLIENTHELLOTEST).c $(SSLV2CONFTEST).c EXHEADER= HEADER= testutil.h $(EXHEADER) @@ -152,7 +153,7 @@ alltests: \ test_gen test_req test_pkcs7 test_verify test_dh test_dsa \ test_ss test_ca test_engine test_evp test_evp_extra test_ssl test_tsa test_ige \ test_jpake test_srp test_cms test_ocsp test_v3name test_heartbeat \ - test_constant_time test_verify_extra test_clienthello + test_constant_time test_verify_extra test_clienthello test_sslv2conftest test_evp: $(EVPTEST)$(EXE_EXT) evptests.txt ../util/shlib_wrap.sh ./$(EVPTEST) evptests.txt @@ -361,6 +362,10 @@ test_clienthello: $(CLIENTHELLOTEST)$(EXE_EXT) @echo $(START) $@ ../util/shlib_wrap.sh ./$(CLIENTHELLOTEST) +test_sslv2conftest: $(SSLV2CONFTEST)$(EXE_EXT) + @echo $(START) $@ + ../util/shlib_wrap.sh ./$(SSLV2CONFTEST) + lint: lint -DLINT $(INCLUDES) $(SRC)>fluff @@ -538,6 +543,9 @@ $(VERIFYEXTRATEST)$(EXE_EXT): $(VERIFYEXTRATEST).o $(CLIENTHELLOTEST)$(EXE_EXT): $(CLIENTHELLOTEST).o 
@target=$(CLIENTHELLOTEST) $(BUILD_CMD) +$(SSLV2CONFTEST)$(EXE_EXT): $(SSLV2CONFTEST).o + @target=$(SSLV2CONFTEST) $(BUILD_CMD) + #$(AESTEST).o: $(AESTEST).c # $(CC) -c $(CFLAGS) -DINTERMEDIATE_VALUE_KAT -DTRACE_KAT_MCT $(AESTEST).c @@ -848,6 +856,25 @@ ssltest.o: ../include/openssl/ssl3.h ../include/openssl/stack.h ssltest.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h ssltest.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h ssltest.o: ../include/openssl/x509v3.h ssltest.c +sslv2conftest.o: ../include/openssl/asn1.h ../include/openssl/bio.h +sslv2conftest.o: ../include/openssl/buffer.h ../include/openssl/comp.h +sslv2conftest.o: ../include/openssl/crypto.h ../include/openssl/dtls1.h +sslv2conftest.o: ../include/openssl/e_os2.h ../include/openssl/ec.h +sslv2conftest.o: ../include/openssl/ecdh.h ../include/openssl/ecdsa.h +sslv2conftest.o: ../include/openssl/err.h ../include/openssl/evp.h +sslv2conftest.o: ../include/openssl/hmac.h ../include/openssl/kssl.h +sslv2conftest.o: ../include/openssl/lhash.h ../include/openssl/obj_mac.h +sslv2conftest.o: ../include/openssl/objects.h ../include/openssl/opensslconf.h +sslv2conftest.o: ../include/openssl/opensslv.h ../include/openssl/ossl_typ.h +sslv2conftest.o: ../include/openssl/pem.h ../include/openssl/pem2.h +sslv2conftest.o: ../include/openssl/pkcs7.h ../include/openssl/pqueue.h +sslv2conftest.o: ../include/openssl/safestack.h ../include/openssl/sha.h +sslv2conftest.o: ../include/openssl/srtp.h ../include/openssl/ssl.h +sslv2conftest.o: ../include/openssl/ssl2.h ../include/openssl/ssl23.h +sslv2conftest.o: ../include/openssl/ssl3.h ../include/openssl/stack.h +sslv2conftest.o: ../include/openssl/symhacks.h ../include/openssl/tls1.h +sslv2conftest.o: ../include/openssl/x509.h ../include/openssl/x509_vfy.h +sslv2conftest.o: sslv2conftest.c v3nametest.o: ../e_os.h ../include/openssl/asn1.h ../include/openssl/bio.h v3nametest.o: ../include/openssl/buffer.h ../include/openssl/conf.h v3nametest.o: ../include/openssl/crypto.h ../include/openssl/e_os2.h diff --git a/deps/openssl/openssl/util/libeay.num b/deps/openssl/openssl/util/libeay.num index 7f7487df504428..e5b3c6ea841c85 100755 --- a/deps/openssl/openssl/util/libeay.num +++ b/deps/openssl/openssl/util/libeay.num @@ -1807,6 +1807,8 @@ ASN1_UTCTIME_get 2350 NOEXIST::FUNCTION: X509_REQ_digest 2362 EXIST::FUNCTION:EVP X509_CRL_digest 2391 EXIST::FUNCTION:EVP ASN1_STRING_clear_free 2392 EXIST::FUNCTION: +SRP_VBASE_get1_by_user 2393 EXIST::FUNCTION:SRP +SRP_user_pwd_free 2394 EXIST::FUNCTION:SRP d2i_ASN1_SET_OF_PKCS7 2397 NOEXIST::FUNCTION: X509_ALGOR_cmp 2398 EXIST::FUNCTION: EVP_CIPHER_CTX_set_key_length 2399 EXIST::FUNCTION: diff --git a/deps/openssl/openssl/util/mk1mf.pl b/deps/openssl/openssl/util/mk1mf.pl index 99652aff918c89..2629a1c5dd645b 100755 --- a/deps/openssl/openssl/util/mk1mf.pl +++ b/deps/openssl/openssl/util/mk1mf.pl @@ -290,6 +290,7 @@ $cflags.=" -DOPENSSL_FIPS" if $fips; $cflags.=" -DOPENSSL_NO_JPAKE" if $no_jpake; $cflags.=" -DOPENSSL_NO_EC2M" if $no_ec2m; +$cflags.=" -DOPENSSL_NO_WEAK_SSL_CIPHERS" if $no_weak_ssl; $cflags.= " -DZLIB" if $zlib_opt; $cflags.= " -DZLIB_SHARED" if $zlib_opt == 2; @@ -482,7 +483,7 @@ # The OpenSSL directory SRC_D=$src_dir -LINK=$link +LINK_CMD=$link LFLAGS=$lflags RSC=$rsc @@ -1205,6 +1206,7 @@ sub read_options "no-jpake" => \$no_jpake, "no-ec2m" => \$no_ec2m, "no-ec_nistp_64_gcc_128" => 0, + "no-weak-ssl-ciphers" => \$no_weak_ssl, "no-err" => \$no_err, "no-sock" => \$no_sock, "no-krb5" => \$no_krb5, diff --git 
a/deps/openssl/openssl/util/pl/BC-32.pl b/deps/openssl/openssl/util/pl/BC-32.pl index f7161d7bfe2d5b..375b0a76dfcb6a 100644 --- a/deps/openssl/openssl/util/pl/BC-32.pl +++ b/deps/openssl/openssl/util/pl/BC-32.pl @@ -118,7 +118,7 @@ sub do_lib_rule { local($ex)=($target =~ /O_SSL/)?' $(L_CRYPTO)':''; $ex.=' ws2_32.lib gdi32.lib'; - $ret.="\t\$(LINK) \$(MLFLAGS) $efile$target /def:ms/${Name}.def @<<\n \$(SHLIB_EX_OBJ) $objs $ex\n<<\n"; + $ret.="\t\$(LINK_CMD) \$(MLFLAGS) $efile$target /def:ms/${Name}.def @<<\n \$(SHLIB_EX_OBJ) $objs $ex\n<<\n"; } $ret.="\n"; return($ret); @@ -132,7 +132,7 @@ sub do_link_rule $file =~ s/\//$o/g if $o ne '/'; $n=&bname($target); $ret.="$target: $files $dep_libs\n"; - $ret.="\t\$(LINK) \$(LFLAGS) $files \$(APP_EX_OBJ), $target,, $libs\n\n"; + $ret.="\t\$(LINK_CMD) \$(LFLAGS) $files \$(APP_EX_OBJ), $target,, $libs\n\n"; return($ret); } diff --git a/deps/openssl/openssl/util/pl/Mingw32.pl b/deps/openssl/openssl/util/pl/Mingw32.pl index fe3fb27a7860f7..55c85f6447142d 100644 --- a/deps/openssl/openssl/util/pl/Mingw32.pl +++ b/deps/openssl/openssl/util/pl/Mingw32.pl @@ -98,7 +98,7 @@ sub do_link_rule $file =~ s/\//$o/g if $o ne '/'; $n=&bname($target); $ret.="$target: $files $dep_libs\n"; - $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n"; + $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n"; return($ret); } 1; diff --git a/deps/openssl/openssl/util/pl/OS2-EMX.pl b/deps/openssl/openssl/util/pl/OS2-EMX.pl index 28cd1169079ab4..92a332e6e9068a 100644 --- a/deps/openssl/openssl/util/pl/OS2-EMX.pl +++ b/deps/openssl/openssl/util/pl/OS2-EMX.pl @@ -99,7 +99,7 @@ sub do_lib_rule { local($ex)=($target =~ /O_SSL/)?' $(L_CRYPTO)':''; $ex.=' -lsocket'; - $ret.="\t\$(LINK) \$(SHLIB_CFLAGS) \$(MLFLAGS) $efile$target \$(SHLIB_EX_OBJ) \$(${Name}OBJ) $ex os2/${Name}.def\n"; + $ret.="\t\$(LINK_CMD) \$(SHLIB_CFLAGS) \$(MLFLAGS) $efile$target \$(SHLIB_EX_OBJ) \$(${Name}OBJ) $ex os2/${Name}.def\n"; $ret.="\temximp -o $out_def/$name.a os2/${Name}.def\n"; $ret.="\temximp -o $out_def/$name.lib os2/${Name}.def\n\n"; } @@ -113,7 +113,7 @@ sub do_link_rule $file =~ s/\//$o/g if $o ne '/'; $n=&bname($target); $ret.="$target: $files $dep_libs\n"; - $ret.="\t\$(LINK) ${efile}$target \$(CFLAG) \$(LFLAGS) $files $libs\n\n"; + $ret.="\t\$(LINK_CMD) ${efile}$target \$(CFLAG) \$(LFLAGS) $files $libs\n\n"; return($ret); } diff --git a/deps/openssl/openssl/util/pl/VC-32.pl b/deps/openssl/openssl/util/pl/VC-32.pl index 0f5547f056c2bc..dba96cba5e75d6 100644 --- a/deps/openssl/openssl/util/pl/VC-32.pl +++ b/deps/openssl/openssl/util/pl/VC-32.pl @@ -330,7 +330,7 @@ sub do_lib_rule if ($fips && $target =~ /O_CRYPTO/) { $ret.="$target: $objs \$(PREMAIN_DSO_EXE)"; - $ret.="\n\tSET FIPS_LINK=\$(LINK)\n"; + $ret.="\n\tSET FIPS_LINK=\$(LINK_CMD)\n"; $ret.="\tSET FIPS_CC=\$(CC)\n"; $ret.="\tSET FIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\n"; $ret.="\tSET PREMAIN_DSO_EXE=\$(PREMAIN_DSO_EXE)\n"; @@ -344,7 +344,7 @@ sub do_lib_rule else { $ret.="$target: $objs"; - $ret.="\n\t\$(LINK) \$(MLFLAGS) $efile$target $name @<<\n \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n<<\n"; + $ret.="\n\t\$(LINK_CMD) \$(MLFLAGS) $efile$target $name @<<\n \$(SHLIB_EX_OBJ) $objs $ex \$(EX_LIBS)\n<<\n"; } $ret.="\tIF EXIST \$@.manifest mt -nologo -manifest \$@.manifest -outputresource:\$@;2\n\n"; } @@ -363,7 +363,7 @@ sub do_link_rule { $ret.=" \$(OBJ_D)${o}applink.obj" if $shlib; $ret.="\n"; - $ret.=" \$(LINK) \$(LFLAGS) $efile$target @<<\n\t"; + $ret.=" \$(LINK_CMD) \$(LFLAGS) $efile$target 
@<<\n\t"; if ($files =~ /O_FIPSCANISTER/ && !$fipscanisterbuild) { $ret.= "\$(EX_LIBS) "; $ret.= "\$(OBJ_D)${o}applink.obj " if $shlib; @@ -373,7 +373,7 @@ sub do_link_rule elsif ($standalone == 2) { $ret.="\n"; - $ret.="\tSET FIPS_LINK=\$(LINK)\n"; + $ret.="\tSET FIPS_LINK=\$(LINK_CMD)\n"; $ret.="\tSET FIPS_CC=\$(CC)\n"; $ret.="\tSET FIPS_CC_ARGS=/Fo\$(OBJ_D)${o}fips_premain.obj \$(SHLIB_CFLAGS) -c\n"; $ret.="\tSET PREMAIN_DSO_EXE=\n"; @@ -386,7 +386,7 @@ sub do_link_rule else { $ret.="\n"; - $ret.="\t\$(LINK) \$(LFLAGS) $efile$target @<<\n"; + $ret.="\t\$(LINK_CMD) \$(LFLAGS) $efile$target @<<\n"; $ret.="\t\$(APP_EX_OBJ) $files $libs\n<<\n"; } $ret.="\tIF EXIST \$@.manifest mt -nologo -manifest \$@.manifest -outputresource:\$@;1\n\n"; diff --git a/deps/openssl/openssl/util/pl/linux.pl b/deps/openssl/openssl/util/pl/linux.pl index d24f7b72913cb2..3362941f7bf3a1 100644 --- a/deps/openssl/openssl/util/pl/linux.pl +++ b/deps/openssl/openssl/util/pl/linux.pl @@ -78,7 +78,7 @@ sub do_link_rule $file =~ s/\//$o/g if $o ne '/'; $n=&bname($target); $ret.="$target: $files $dep_libs\n"; - $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n"; + $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n"; return($ret); } diff --git a/deps/openssl/openssl/util/pl/netware.pl b/deps/openssl/openssl/util/pl/netware.pl index fe80a9bb89909e..16f4f4ee37c5a9 100644 --- a/deps/openssl/openssl/util/pl/netware.pl +++ b/deps/openssl/openssl/util/pl/netware.pl @@ -506,22 +506,22 @@ sub do_link_rule if ($gnuc) { $ret.="\t\$(MKLIB) $lib_flags \$(TMP_D)${o}\$(E_EXE).a \$(filter-out \$(TMP_D)${o}\$(E_EXE)${obj},$files)\n"; - $ret.="\t\$(LINK) \$(LFLAGS) $def_file2\n"; + $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file2\n"; $ret.="\t\@$mv \$(E_EXE)2.nlm \$(TEST_D)\n"; } else { - $ret.="\t\$(LINK) \$(LFLAGS) $def_file2 $files \"$prelude\" $libs -o $target2\n"; + $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file2 $files \"$prelude\" $libs -o $target2\n"; } } if ($gnuc) { - $ret.="\t\$(LINK) \$(LFLAGS) $def_file\n"; + $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file\n"; $ret.="\t\@$mv \$(\@F) \$(TEST_D)\n"; } else { - $ret.="\t\$(LINK) \$(LFLAGS) $def_file $files \"$prelude\" $libs -o $target\n"; + $ret.="\t\$(LINK_CMD) \$(LFLAGS) $def_file $files \"$prelude\" $libs -o $target\n"; } $ret.="\n"; diff --git a/deps/openssl/openssl/util/pl/ultrix.pl b/deps/openssl/openssl/util/pl/ultrix.pl index ea370c71f9687d..0c76c83b4a73ce 100644 --- a/deps/openssl/openssl/util/pl/ultrix.pl +++ b/deps/openssl/openssl/util/pl/ultrix.pl @@ -31,7 +31,7 @@ sub do_link_rule $file =~ s/\//$o/g if $o ne '/'; $n=&bname($target); $ret.="$target: $files $dep_libs\n"; - $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n"; + $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n"; return($ret); } diff --git a/deps/openssl/openssl/util/pl/unix.pl b/deps/openssl/openssl/util/pl/unix.pl index 1d4e9dc5df1908..8818c5bcb1b249 100644 --- a/deps/openssl/openssl/util/pl/unix.pl +++ b/deps/openssl/openssl/util/pl/unix.pl @@ -164,7 +164,7 @@ sub do_link_rule $file =~ s/\//$o/g if $o ne '/'; $n=&bname($target); $ret.="$target: $files $dep_libs\n"; - $ret.="\t\$(LINK) ${efile}$target \$(LFLAGS) $files $libs\n\n"; + $ret.="\t\$(LINK_CMD) ${efile}$target \$(LFLAGS) $files $libs\n\n"; return($ret); }
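
A minimal usage sketch (illustrative only, not part of the upstream patch; the function name is hypothetical): with this upgrade SSL_CTX_new() sets SSL_OP_NO_SSLv2 by default and the "Protocol" configuration command can no longer switch SSLv2 back on, so an application that really wants SSLv2 has to clear the option explicitly:

#include <openssl/ssl.h>

/* Sketch: SSLv2 is off by default in 1.0.2g; only an explicit
 * SSL_CTX_clear_options() (or SSL_clear_options()) re-enables it. */
int sslv2_opt_in_example(void)
{
    SSL_CTX *ctx;

    SSL_library_init();
    ctx = SSL_CTX_new(SSLv23_method());
    if (ctx == NULL)
        return -1;

    /* New default set inside SSL_CTX_new(). */
    if (!(SSL_CTX_get_options(ctx) & SSL_OP_NO_SSLv2)) {
        SSL_CTX_free(ctx);
        return -1;
    }

    /* Opting back in must be explicit; neither an SSL_CONF "Protocol"
     * directive nor SSL_CTX_set_options() will turn SSLv2 on. */
    SSL_CTX_clear_options(ctx, SSL_OP_NO_SSLv2);

    SSL_CTX_free(ctx);
    return 0;
}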